OpenCL* for CPU
Ask questions and share information on Intel® SDK for OpenCL™ Applications and OpenCL™ implementations for Intel® CPU.
公告
This forum covers OpenCL* for CPU only. OpenCL* for GPU questions can be asked in the GPU Compute Software forum. Intel® FPGA SDK for OpenCL™ questions can be ask in the FPGA Intel® High Level Design forum.
1733 讨论

clEnqueueReadBuffer sometimes too slow(2second)

Hschoi_C_
初学者
3,383 次查看

clEnqueueReadBuffer sometimes too slow(2second)

Hello, I'm making real-time application but sometimes clEnqueueReadBuffer is too slow.
I tested clEnqueueReadBuffer with attached code. But I don't know why it is slow. Please help.

this is my environment.

OS: Windows 10 Pro 64 bits
CPU: Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz
GPU : Intel(R) HD Graphics 630
OpenCL: 1.2 version.
Intel OpenCL SDK:
Version=6.3.0.1904
InternalVersion=dkdnfngdfkjndfkjgndfndfgk
Visual Studio Professional 2015.

below is my test code. full project is attached.

	cl_mem					d_buf;
	unsigned char *			h_in;
	unsigned char *			h_out;
	int						byte;
	
	unsigned long			tick_start;
	unsigned long			tick_end;
	int						idx;


	// initialize
	byte	= 4096;
	
	h_in	= new	unsigned char[4096];
	h_out	= new	unsigned char[4096];
	d_buf	= clCreateBuffer( ocl.context, CL_MEM_READ_WRITE, byte, NULL, &err );
	
	::memset( h_in, 0, byte );
	err = clEnqueueWriteBuffer( ocl.commandQueue, d_buf, CL_TRUE, 0, byte, h_in, 0,	NULL,NULL );
	if ( CL_SUCCESS != err ) {
		printf( "WriteError %d \r\n", err );
		DebugBreak();
	}


	// main loop
	for ( idx = 0 ; idx <= 500000 ; idx ++ ) { 
		tick_start = ::GetTickCount();
	
		err = clEnqueueReadBuffer( ocl.commandQueue, d_buf,	CL_TRUE, 0, byte, h_out, 0,	NULL, NULL );
		if ( CL_SUCCESS != err ) {
			printf( "ReadError %d \r\n", err );
			DebugBreak();
		}

		tick_end = ::GetTickCount();
		
		// for check progress
		if ( idx %10000 == 0 ) {
			printf( "idx: %d \r\n", idx );
		}

		// for check large delay
		if ( tick_end - tick_start > 100 ) {
			printf( "idx: %d, Elapsed: %d ms \r\n", idx,  (int)(tick_end - tick_start));
		}
	}

	// rlease memory
	clReleaseMemObject( d_buf );
	delete [] h_in;
	delete [] h_out;

 

0 项奖励
1 解答
Ben_A_Intel
员工
3,383 次查看

Since you're measuring wall clock time, is it possible that every so often your application gets swapped out for something else?  This would show up as a large delay, even though the majority of the time was spent in a different process.  If this is the case then increasing the priority of your application might eliminate some of the delays, but perhaps not all.

It would be interesting to enable event profiling to measure the exact cost of clEnqueueReadBuffer(), by checking the delta between CL_PROFILING_COMMAND_END and CL_PROFILING_COMMAND_START.  I suspect it will remain consistent even if the wall clock time is varying.

在原帖中查看解决方案

0 项奖励
3 回复数
Ben_A_Intel
员工
3,384 次查看

Since you're measuring wall clock time, is it possible that every so often your application gets swapped out for something else?  This would show up as a large delay, even though the majority of the time was spent in a different process.  If this is the case then increasing the priority of your application might eliminate some of the delays, but perhaps not all.

It would be interesting to enable event profiling to measure the exact cost of clEnqueueReadBuffer(), by checking the delta between CL_PROFILING_COMMAND_END and CL_PROFILING_COMMAND_START.  I suspect it will remain consistent even if the wall clock time is varying.

0 项奖励
Hschoi_C_
初学者
3,383 次查看

Thank you. Ben Asbaugh. 

The time between CL_PROFILING_COMMAND_END - CL_PROFILING_COMMAND_START was 3 us while elapsed time using wall clock is 2 second.

I checked other times.
submit   - queued:  3.50 us
start    - submit: 91.75 us
end      -  start:  3.00 us
complete -    end:  0.00 us

I also using visual studio attached nsight profiler. 
It shows that memory copy is completed but clEnqueueReadBuffer end after 2 second.
How can I check the delay is swap out or something else?

Below is my profiling code. 

	// main loop
	for ( idx = 0 ; idx <= 500000 ; idx ++ ) { 
	
		tick_start = ::GetTickCount();

		err = clEnqueueReadBuffer( ocl.commandQueue, d_buf,	CL_TRUE, 0, byte, h_out, 0,	NULL, &prof_event );
		if ( CL_SUCCESS != err ) {
			printf( "ReadError %d \r\n", err );
			DebugBreak();
		}
		
		tick_end = ::GetTickCount();

		// wait event
		err = clWaitForEvents( 1, &prof_event );
		if ( CL_SUCCESS != err ) {
			printf( "WaitError %d \r\n", err );
			DebugBreak();
		}

		// for check progress
		if ( idx %10000 == 0 ) {
			printf( "idx: %d \r\n", idx );
		}

		// for check large delay
		if ( tick_end - tick_start > 100 ) {
			// get event profiling
			err = clGetEventProfilingInfo( prof_event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &queued_time, &return_bytes);
			if ( CL_SUCCESS != err ) {
				printf( "ProfError1 %d \r\n", err );
				DebugBreak();
			}
			err = clGetEventProfilingInfo( prof_event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &submit_time, &return_bytes);
			if ( CL_SUCCESS != err ) {
				printf( "ProfError2 %d \r\n", err );
				DebugBreak();
			}
			err = clGetEventProfilingInfo( prof_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start_time, &return_bytes);
			if ( CL_SUCCESS != err ) {
				printf( "ProfError3 %d \r\n", err );
				DebugBreak();
			}
			err = clGetEventProfilingInfo( prof_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end_time, &return_bytes);
			if ( CL_SUCCESS != err ) {
				printf( "ProfError4 %d \r\n", err );
				DebugBreak();
			}
			err = clGetEventProfilingInfo( prof_event, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &complete_time, &return_bytes);
			if ( CL_SUCCESS != err ) {
				printf( "ProfError5 %d \r\n", err );
				DebugBreak();
			}


			printf( "idx: %d, Wall clock Elapsed: %d ms \r\n", idx, (int)(tick_end - tick_start) );
			printf( "Submit   - Queued: %.2f us \r\n", (double)( (submit_time	- queued_time) / 1000.0 ) );
			printf( "Start    - Submit: %.2f us \r\n", (double)( (start_time	- submit_time) / 1000.0 ) );
			printf( "End      - Start : %.2f us \r\n", (double)( (end_time		- start_time ) / 1000.0 ) );
			printf( "Complete - End   : %.2f us \r\n", (double)( (complete_time	- end_time	 ) / 1000.0 ) );
		}
		
		clReleaseEvent( prof_event );
	}

 

0 项奖励
Hschoi_C_
初学者
3,383 次查看

Solved.

I updated intel graphic driver from 21.20.16.4678 to 22.20.16.4815.

0 项奖励
回复