    ! own mat-vec multiplication
      module mlt
      contains
      subroutine matvec( n, blksize, x, y, a, ja, ia )
      
      real*8  :: x(:), y(:), a(:)
      integer :: n, blksize, ja(:), ia(:)
      real*8  :: t, tn(blksize)
      integer :: i, j, k, iv, jv, ks, iblk, jblk

      if ( blksize==1 ) then
         ! CSR
         do i = 1, n
            t = 0.0d0
            do k = ia(i), ia(i+1)-1
               t = t + a(k)*x(ja(k))
            enddo
            y(i) = t
         enddo

      else
         ! BSR
         do i = 1, n
            iv = (i-1)*blksize
            tn(:) = 0.0d0
            do k = ia(i), ia(i+1)-1
               j = ja(k)
               ks = (k-1)*blksize*blksize
               jv = (j-1)*blksize
               do iblk = 1, blksize
                  do jblk = 1, blksize
                     tn(iblk) = tn(iblk) + a(ks+(iblk-1)*blksize+jblk)*x(jv+jblk)
                  enddo
               enddo
            enddo
            do iblk = 1, blksize
               y(iv+iblk) = tn(iblk)
            enddo
         enddo

      endif

      return
      end subroutine matvec
      end module mlt






      program test_random_number
      use mlt
      implicit none
      integer i,k
      integer,parameter   :: blksize=3
      integer,parameter   :: dimx=10 !rows (of blocks)
      integer,parameter   :: dimy=15 !cols
      integer,parameter   :: dimz=85 !layers
      real*8              :: x(blksize*dimx*dimy*dimz)
      real*8              :: y1(blksize*dimx*dimy*dimz)
      real*8              :: y2(blksize*dimx*dimy*dimz)
      integer*8           :: seed = 1067
      integer*8           :: nrange = 30000
      integer*8           :: reps = 100000
      real*8              :: random_uniform
      integer             :: cnt,iblk,jblk,cnt_blk,cnt_row
      integer             :: n_nonzeroblocks
      real*8,allocatable,dimension(:)  :: a(:)
      integer,allocatable,dimension(:) :: columns(:),columns1(:)
      integer,allocatable,dimension(:) :: rowIndex(:), rowIndex1(:)
      real*8              :: sumdiff,test
      integer             :: cnt_rate, t0, t1

      n_nonzeroblocks=dimx*dimy*dimz*6-dimx*dimy*2-dimz*dimy*2-dimz*dimx*2
      write (*,*) n_nonzeroblocks,'nonzero blocks',',block size',blksize
      allocate(a(n_nonzeroblocks*blksize*blksize))
      allocate(columns(n_nonzeroblocks))
      allocate(rowIndex(dimx*dimy*dimz+1))
      allocate(columns1(n_nonzeroblocks))
      allocate(rowIndex1(dimx*dimy*dimz+1))
      rowIndex(1)=1
      cnt_blk=1
      cnt=1

      !generate csr/bsr based on neighborhood in the Cartesian grid
      do iblk = 1,dimx*dimy*dimz
         cnt_row=0
         do jblk=1,dimx*dimy*dimz
            if (    ((jblk == iblk-1  .or. jblk == iblk+1) .and. (iblk-1)/dimy == (jblk-1)/dimy)  &  ! left and right nbr and on the same line
     &         .or. ((jblk == iblk-dimy .or. jblk == iblk+dimy) .and. (iblk-1)/(dimx*dimy) == (jblk-1)/(dimx*dimy) )&  ! upper and lower nbr and on the same surface
     &         .or. jblk == iblk-dimx*dimy .or. jblk == iblk+dimx*dimy) then
               cnt_row=cnt_row+1
               columns(cnt_blk)=jblk
               cnt_blk=cnt_blk+1
               test=random_uniform(seed)
               do k=1,blksize*blksize
                  a(cnt)=test!random_uniform(seed)
                  cnt=cnt+1
               enddo
            endif
         enddo
         rowIndex(iblk+1)=rowIndex(iblk)+cnt_row
      enddo
      do i = 1,size(x)
         x(i)=1+floor(random_uniform(seed)*nrange)-0.1
      enddo

      rowIndex1(:)=rowIndex(:)-1
      columns1(:)=columns(:)-1

      call system_clock(count_rate=cnt_rate)
      call system_clock(count=t0)
      do i=1,reps
         if (blksize==1) then
            call mkl_dcsrgemv('N',dimx*dimy*dimz,a,rowIndex,columns,x,y1)
         else
            call mkl_cspblas_dbsrgemv('N',dimx*dimy*dimz,blksize,a,rowIndex1,columns1,x,y1)
            !call mkl_dbsrgemv('N',dimx*dimy*dimz,blksize,a,rowIndex,columns,x,y1)
         endif
      enddo
      call system_clock(count=t1)
      write (*,*) 'mkl ',(t1-t0)/real(cnt_rate)
      call system_clock(count=t0)
      do i=1,reps
         call matvec(dimx*dimy*dimz,blksize,x,y2,a,columns,rowIndex)
      enddo
      call system_clock(count=t1)
      write (*,*) 'generic took',(t1-t0)/real(cnt_rate)

      sumdiff=0.0d0
      do i = 1,size(y1)
         sumdiff = sumdiff+abs(y1(i)-y2(i))
      enddo
      write (*,*) 'difference between y1 and y2',sumdiff

      end program

      ! thanks, StackOverflow
      real*8 function random_uniform(seed)
      implicit none
      integer*8 seed
      seed = mod(7**5*seed, 2147483647)
      random_uniform = seed / 2147483647.
      end function
