<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic clEnqueueReadBuffer sometimes too slow(2second) in OpenCL* for CPU</title>
    <link>https://community.intel.com/t5/OpenCL-for-CPU/clEnqueueReadBuffer-sometimes-too-slow-2second/m-p/1145475#M5976</link>
    <description>&lt;P&gt;clEnqueueReadBuffer sometimes too slow(2second)&lt;/P&gt;

&lt;P&gt;Hello, I'm making a real-time application, but sometimes clEnqueueReadBuffer is too slow.&lt;BR /&gt;
	I tested clEnqueueReadBuffer with the attached code, but I don't know why it is slow. Please help.&lt;/P&gt;

&lt;P&gt;this is my environment.&lt;/P&gt;

&lt;P&gt;OS: Windows 10 Pro 64 bits&lt;BR /&gt;
	CPU: Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz&lt;BR /&gt;
	GPU : Intel(R) HD Graphics 630&lt;BR /&gt;
	OpenCL: 1.2 version.&lt;BR /&gt;
	Intel OpenCL SDK:&lt;BR /&gt;
	Version=6.3.0.1904&lt;BR /&gt;
	InternalVersion=dkdnfngdfkjndfkjgndfndfgk&lt;BR /&gt;
	Visual Studio Professional 2015.&lt;/P&gt;

&lt;P&gt;Below is my test code. The full project is attached.&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;	cl_mem					d_buf;
	unsigned char *			h_in;
	unsigned char *			h_out;
	int						byte;
	
	unsigned long			tick_start;
	unsigned long			tick_end;
	int						idx;


	// initialize
	byte	= 4096;
	
	h_in	= new	unsigned char[4096];
	h_out	= new	unsigned char[4096];
	d_buf	= clCreateBuffer( ocl.context, CL_MEM_READ_WRITE, byte, NULL, &amp;amp;err );
	
	::memset( h_in, 0, byte );
	err = clEnqueueWriteBuffer( ocl.commandQueue, d_buf, CL_TRUE, 0, byte, h_in, 0,	NULL,NULL );
	if ( CL_SUCCESS != err ) {
		printf( "WriteError %d \r\n", err );
		DebugBreak();
	}


	// main loop
	for ( idx = 0 ; idx &amp;lt;= 500000 ; idx ++ ) { 
		tick_start = ::GetTickCount();
	
		err = clEnqueueReadBuffer( ocl.commandQueue, d_buf,	CL_TRUE, 0, byte, h_out, 0,	NULL, NULL );
		if ( CL_SUCCESS != err ) {
			printf( "ReadError %d \r\n", err );
			DebugBreak();
		}

		tick_end = ::GetTickCount();
		
		// for check progress
		if ( idx %10000 == 0 ) {
			printf( "idx: %d \r\n", idx );
		}

		// for check large delay
		if ( tick_end - tick_start &amp;gt; 100 ) {
			printf( "idx: %d, Elapsed: %d ms \r\n", idx,  (int)(tick_end - tick_start));
		}
	}

	// release memory
	clReleaseMemObject( d_buf );
	delete [] h_in;
	delete [] h_out;&lt;/PRE&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Tue, 24 Oct 2017 02:28:58 GMT</pubDate>
    <dc:creator>Hschoi_C_</dc:creator>
    <dc:date>2017-10-24T02:28:58Z</dc:date>
    <item>
      <title>clEnqueueReadBuffer sometimes too slow(2second)</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/clEnqueueReadBuffer-sometimes-too-slow-2second/m-p/1145475#M5976</link>
      <description>&lt;P&gt;clEnqueueReadBuffer sometimes too slow(2second)&lt;/P&gt;

&lt;P&gt;Hello, I'm making a real-time application, but sometimes clEnqueueReadBuffer is too slow.&lt;BR /&gt;
	I tested clEnqueueReadBuffer with the attached code, but I don't know why it is slow. Please help.&lt;/P&gt;

&lt;P&gt;this is my environment.&lt;/P&gt;

&lt;P&gt;OS: Windows 10 Pro 64 bits&lt;BR /&gt;
	CPU: Intel(R) Core(TM) i7-7700 CPU @ 3.60GHz&lt;BR /&gt;
	GPU : Intel(R) HD Graphics 630&lt;BR /&gt;
	OpenCL: 1.2 version.&lt;BR /&gt;
	Intel OpenCL SDK:&lt;BR /&gt;
	Version=6.3.0.1904&lt;BR /&gt;
	InternalVersion=dkdnfngdfkjndfkjgndfndfgk&lt;BR /&gt;
	Visual Studio Professional 2015.&lt;/P&gt;

&lt;P&gt;Below is my test code. The full project is attached.&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;	cl_mem					d_buf;
	unsigned char *			h_in;
	unsigned char *			h_out;
	int						byte;
	
	unsigned long			tick_start;
	unsigned long			tick_end;
	int						idx;


	// initialize
	byte	= 4096;
	
	h_in	= new	unsigned char[4096];
	h_out	= new	unsigned char[4096];
	d_buf	= clCreateBuffer( ocl.context, CL_MEM_READ_WRITE, byte, NULL, &amp;amp;err );
	
	::memset( h_in, 0, byte );
	err = clEnqueueWriteBuffer( ocl.commandQueue, d_buf, CL_TRUE, 0, byte, h_in, 0,	NULL,NULL );
	if ( CL_SUCCESS != err ) {
		printf( "WriteError %d \r\n", err );
		DebugBreak();
	}


	// main loop
	for ( idx = 0 ; idx &amp;lt;= 500000 ; idx ++ ) { 
		tick_start = ::GetTickCount();
	
		err = clEnqueueReadBuffer( ocl.commandQueue, d_buf,	CL_TRUE, 0, byte, h_out, 0,	NULL, NULL );
		if ( CL_SUCCESS != err ) {
			printf( "ReadError %d \r\n", err );
			DebugBreak();
		}

		tick_end = ::GetTickCount();
		
		// for check progress
		if ( idx %10000 == 0 ) {
			printf( "idx: %d \r\n", idx );
		}

		// for check large delay
		if ( tick_end - tick_start &amp;gt; 100 ) {
			printf( "idx: %d, Elapsed: %d ms \r\n", idx,  (int)(tick_end - tick_start));
		}
	}

	// release memory
	clReleaseMemObject( d_buf );
	delete [] h_in;
	delete [] h_out;&lt;/PRE&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 24 Oct 2017 02:28:58 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/clEnqueueReadBuffer-sometimes-too-slow-2second/m-p/1145475#M5976</guid>
      <dc:creator>Hschoi_C_</dc:creator>
      <dc:date>2017-10-24T02:28:58Z</dc:date>
    </item>
    <item>
      <title>Since you're measuring wall</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/clEnqueueReadBuffer-sometimes-too-slow-2second/m-p/1145476#M5977</link>
      <description>&lt;P&gt;Since you're measuring wall clock time, is it possible that every so often your application gets swapped out for something else?&amp;nbsp; This would show up as a large delay, even though the majority of the time was spent in a different process.&amp;nbsp; If this is the case then increasing the priority of your application might eliminate some of the delays, but perhaps not all.&lt;/P&gt;

&lt;P&gt;It would be interesting to enable event profiling to measure the exact cost of clEnqueueReadBuffer(), by checking the delta between CL_PROFILING_COMMAND_END and CL_PROFILING_COMMAND_START.&amp;nbsp; I suspect it will remain consistent even if the wall clock time is varying.&lt;/P&gt;</description>
      <pubDate>Tue, 24 Oct 2017 15:08:13 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/clEnqueueReadBuffer-sometimes-too-slow-2second/m-p/1145476#M5977</guid>
      <dc:creator>Ben_A_Intel</dc:creator>
      <dc:date>2017-10-24T15:08:13Z</dc:date>
    </item>
    <item>
      <title>Thank you. Ben Asbaugh. </title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/clEnqueueReadBuffer-sometimes-too-slow-2second/m-p/1145477#M5978</link>
      <description>&lt;P&gt;Thank you. Ben Asbaugh.&amp;nbsp;&lt;/P&gt;

&lt;P&gt;&lt;STRONG&gt;The time between&amp;nbsp;&lt;/STRONG&gt;&lt;SPAN style="font-size: 12px;"&gt;&lt;STRONG&gt;CL_PROFILING_COMMAND_END -&amp;nbsp;CL_PROFILING_COMMAND_START was 3 us&lt;/STRONG&gt; while the elapsed wall-clock time is 2 seconds.&lt;/SPAN&gt;&lt;/P&gt;

&lt;P&gt;I checked other times.&lt;BR /&gt;
	submit&amp;nbsp; &amp;nbsp;- queued:&amp;nbsp; 3.50 us&lt;BR /&gt;
	start&amp;nbsp; &amp;nbsp; - submit: 91.75 us&lt;BR /&gt;
	end&amp;nbsp; &amp;nbsp; &amp;nbsp; -&amp;nbsp; start:&amp;nbsp; 3.00 us&lt;BR /&gt;
	complete -&amp;nbsp; &amp;nbsp; end:&amp;nbsp; 0.00 us&lt;/P&gt;

&lt;P&gt;I am also using the Nsight profiler attached to Visual Studio.&amp;nbsp;&lt;BR /&gt;
	It shows that the memory copy is completed, but clEnqueueReadBuffer ends after 2 seconds.&lt;BR /&gt;
	&lt;STRONG&gt;How can I check whether the delay is caused by being swapped out or by something else?&lt;/STRONG&gt;&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 1em;"&gt;Below is my profiling code.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;

&lt;PRE class="brush:cpp;"&gt;	// main loop
	for ( idx = 0 ; idx &amp;lt;= 500000 ; idx ++ ) { 
	
		tick_start = ::GetTickCount();

		err = clEnqueueReadBuffer( ocl.commandQueue, d_buf,	CL_TRUE, 0, byte, h_out, 0,	NULL, &amp;amp;prof_event );
		if ( CL_SUCCESS != err ) {
			printf( "ReadError %d \r\n", err );
			DebugBreak();
		}
		
		tick_end = ::GetTickCount();

		// wait event
		err = clWaitForEvents( 1, &amp;amp;prof_event );
		if ( CL_SUCCESS != err ) {
			printf( "WaitError %d \r\n", err );
			DebugBreak();
		}

		// for check progress
		if ( idx %10000 == 0 ) {
			printf( "idx: %d \r\n", idx );
		}

		// for check large delay
		if ( tick_end - tick_start &amp;gt; 100 ) {
			// get event profiling
			err = clGetEventProfilingInfo( prof_event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &amp;amp;queued_time, &amp;amp;return_bytes);
			if ( CL_SUCCESS != err ) {
				printf( "ProfError1 %d \r\n", err );
				DebugBreak();
			}
			err = clGetEventProfilingInfo( prof_event, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &amp;amp;submit_time, &amp;amp;return_bytes);
			if ( CL_SUCCESS != err ) {
				printf( "ProfError2 %d \r\n", err );
				DebugBreak();
			}
			err = clGetEventProfilingInfo( prof_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &amp;amp;start_time, &amp;amp;return_bytes);
			if ( CL_SUCCESS != err ) {
				printf( "ProfError3 %d \r\n", err );
				DebugBreak();
			}
			err = clGetEventProfilingInfo( prof_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &amp;amp;end_time, &amp;amp;return_bytes);
			if ( CL_SUCCESS != err ) {
				printf( "ProfError4 %d \r\n", err );
				DebugBreak();
			}
			err = clGetEventProfilingInfo( prof_event, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &amp;amp;complete_time, &amp;amp;return_bytes);
			if ( CL_SUCCESS != err ) {
				printf( "ProfError5 %d \r\n", err );
				DebugBreak();
			}


			printf( "idx: %d, Wall clock Elapsed: %d ms \r\n", idx, (int)(tick_end - tick_start) );
			printf( "Submit   - Queued: %.2f us \r\n", (double)( (submit_time	- queued_time) / 1000.0 ) );
			printf( "Start    - Submit: %.2f us \r\n", (double)( (start_time	- submit_time) / 1000.0 ) );
			printf( "End      - Start : %.2f us \r\n", (double)( (end_time		- start_time ) / 1000.0 ) );
			printf( "Complete - End   : %.2f us \r\n", (double)( (complete_time	- end_time	 ) / 1000.0 ) );
		}
		
		clReleaseEvent( prof_event );
	}&lt;/PRE&gt;

&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 25 Oct 2017 02:21:45 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/clEnqueueReadBuffer-sometimes-too-slow-2second/m-p/1145477#M5978</guid>
      <dc:creator>Hschoi_C_</dc:creator>
      <dc:date>2017-10-25T02:21:45Z</dc:date>
    </item>
    <item>
      <title>Solved.</title>
      <link>https://community.intel.com/t5/OpenCL-for-CPU/clEnqueueReadBuffer-sometimes-too-slow-2second/m-p/1145478#M5979</link>
      <description>&lt;P&gt;Solved.&lt;/P&gt;

&lt;P&gt;&lt;SPAN style="font-size: 1em;"&gt;I updated the &lt;/SPAN&gt;Intel graphics driver from&amp;nbsp;&lt;SPAN style="font-size: 1em;"&gt;21.20.16.4678 to&amp;nbsp;&lt;/SPAN&gt;22.20.16.4815.&lt;/P&gt;</description>
      <pubDate>Wed, 25 Oct 2017 07:42:53 GMT</pubDate>
      <guid>https://community.intel.com/t5/OpenCL-for-CPU/clEnqueueReadBuffer-sometimes-too-slow-2second/m-p/1145478#M5979</guid>
      <dc:creator>Hschoi_C_</dc:creator>
      <dc:date>2017-10-25T07:42:53Z</dc:date>
    </item>
  </channel>
</rss>

