Intel® Fortran Compiler
Build applications that can scale for the future with optimized code designed for Intel® Xeon® and compatible processors.
27431 Discussions

Poor OpenMP scaling with ifort but not gfortran


I’ve been seeing some strange OpenMP scaling behavior that I’m not sure how to explain.
I have a simple test program that scales nicely when compiled with gfortran but poorly when compiled with ifort.




My test program is the following:

!> Holds the function-parser object used by the marble benchmark.
!!
!! `parser` is declared threadprivate, so every OpenMP thread works on its
!! own copy of the object rather than a single shared instance.
module parserMod
  use function_parser, only: fparser_array
  implicit none

  ! Parser instance; the explicit SAVE below gives it the static storage
  ! that the threadprivate directive requires.
  type(fparser_array) :: parser
  save :: parser
  !$omp threadprivate(parser)

end module parserMod
!> Set up the thread-private parser on each thread, then process every
!! marble (one column of `marbles`) in an OpenMP parallel loop.
!!
!! Arguments:
!!   marbles    - 6 x 200000 array, updated in place; element 1 of each
!!                column is reset to 1 before the column is handed to doWork.
!!   numThreads - number of OpenMP threads requested for both parallel regions.
!!
!! Stops the program with code 99 if the parser reports an error after parsing.
subroutine parallelMarbles(marbles, numThreads)

  use parserMod, only : parser
  use iso_fortran_env, only : wp => real64, output_unit
  implicit none

  real(wp), dimension(6,200000), intent(inout) :: marbles
  integer, intent(in)                          :: numThreads

  integer :: indx
  character(len=1), dimension(3), parameter :: parserVars = ['x', 'y', 'z']

  ! All threads initialize the parser: it is threadprivate, so each thread
  ! must parse into its own copy before the work loop uses it.
  !$omp parallel num_threads(numThreads)
  call parser%parse(parserVars, parserVars)
  if (parser%error()) then
    call parser%print_errors(output_unit)
    stop 99
  end if
  !$omp end parallel

  ! Columns are independent, so the loop is split across threads by column.
  !$omp parallel do default(none) &
  !$omp private(indx) &
  !$omp shared(marbles) &
  !$omp num_threads(numThreads)
  do indx = 1, size(marbles, 2)
    marbles(1,indx) = 1.0_wp
    call doWork(marbles(:,indx))
  end do
  !$omp end parallel do

end subroutine parallelMarbles
!> Repeatedly refresh one marble and pass it through the thread-private parser.
!!
!! Arguments:
!!   marble - six-element vector, updated in place. On each of the 200 passes
!!            elements 2-4 are recomputed from element 1, the parser is
!!            evaluated with marble(1:3) and marble(4:6) as its arguments,
!!            and element 1 becomes the sum of elements 2-6.
subroutine doWork(marble)
  use omp_lib, only : omp_get_thread_num
  use parserMod, only : parser
  use iso_fortran_env, only: wp => real64
  implicit none
  real(wp), dimension(6), intent(inout) :: marble

  integer, parameter :: n_passes = 200
  integer :: pass, tid

  ! The calling thread's number does not change during this call, so it is
  ! read once instead of three times per pass.
  tid = omp_get_thread_num()

  do pass = 1, n_passes
    marble(2) = real(mod(pass, 6 + tid), wp) * marble(1)
    marble(3) = real(mod(pass, 5 + tid), wp) * marble(1)
    marble(4) = real(mod(pass, 4 + tid), wp) * marble(1)

    ! NOTE(review): presumably marble(1:3) are the inputs and marble(4:6)
    ! receives the results - confirm against the parser's evaluate interface.
    call parser%evaluate(marble(1:3), marble(4:6))
    marble(1) = sum(marble(2:6))
  end do

end subroutine doWork
!> Benchmark driver: times parallelMarbles with 1 thread and then with 4
!! threads, printing the wall-clock loop time and the speedup relative to
!! the single-thread run.
program testOMP
  use iso_fortran_env, only: wp => real64
  implicit none

  ! real(wp), allocatable, dimension(:,:)    :: marbles
  real(wp), dimension(6,200000)    :: marbles
  integer                          :: numThreads
  real(wp)                         :: singleTime, threadTime

  integer :: startTime, endTime, countRate, countMax
  character(len=25)   :: varString

  ! allocate(marbles(6,200000))
  ! Stride of 3 gives exactly two runs: numThreads = 1 and numThreads = 4.
  do numThreads = 1, 4, 3
    write(*,*) 'Calling parallel marbles with ', numThreads, ' threads.'

    call system_clock(startTime, countRate, countMax)
    call parallelMarbles(marbles, numThreads)
    call system_clock(endTime)

    ! Elapsed wall-clock time in seconds (clock ticks divided by tick rate).
    threadTime = (real(endTime, wp) - real(startTime, wp)) / real(countRate, wp)

    write (varString, '(F25.6)') threadTime
    write (*, '(A)') ' Loop time = ' // trim(adjustl(varString)) // ' seconds.'

    ! The single-thread run comes first and is the baseline for the speedup.
    if (numThreads .eq. 1) then
      singleTime = threadTime
    end if

    write (varString, '(F25.6)') singleTime / threadTime
    write (*, '(A)') ' Speedup = ' // trim(adjustl(varString)) // 'x.'

    write(*,*) '------------------------------------------------------'
  end do
end program testOMP

 My code uses the fortran_function_parser module from:


I've attached the source files for the test program along with the fortran function parser module for convenience.


Any insight into what might be going wrong in ifort and how I might improve the performance with ifort would be greatly appreciated.


Labels (2)
0 Replies