import pyopencl as cl ctx = cl.create_some_context() src = r"""//CL// #define local_barrier() barrier(CLK_LOCAL_MEM_FENCE); #define WITHIN_KERNEL /* empty */ #define KERNEL __kernel #define GLOBAL_MEM __global #define LOCAL_MEM __local #define LOCAL_MEM_ARG __local #define REQD_WG_SIZE(X,Y,psc_Z) __attribute__((reqd_work_group_size(X, Y, psc_Z))) #define psc_LID_0 get_local_id(0) #define psc_LID_1 get_local_id(1) #define psc_LID_2 get_local_id(2) #define psc_GID_0 get_group_id(0) #define psc_GID_1 get_group_id(1) #define psc_GID_2 get_group_id(2) #define psc_LDIM_0 get_local_size(0) #define psc_LDIM_1 get_local_size(1) #define psc_LDIM_2 get_local_size(2) #define psc_GDIM_0 get_num_groups(0) #define psc_GDIM_1 get_num_groups(1) #define psc_GDIM_2 get_num_groups(2) #pragma OPENCL EXTENSION cl_khr_fp64: enable //CL// #define psc_WG_SIZE 16 #define psc_SCAN_EXPR(a, b, across_seg_boundary) a+b #define psc_INPUT_EXPR(i) (input_ary[i]) typedef int psc_scan_type; typedef int psc_index_type; // NO_SEG_BOUNDARY is the largest representable integer in psc_index_type. // This assumption is used in code below. #define NO_SEG_BOUNDARY 2147483647 //CL// #define psc_K 64 KERNEL REQD_WG_SIZE(psc_WG_SIZE, 1, 1) void scan_scan_intervals( __global int *input_ary, __global int *output_ary, GLOBAL_MEM psc_scan_type *psc_partial_scan_buffer, const psc_index_type N, const psc_index_type psc_interval_size , GLOBAL_MEM psc_scan_type *psc_interval_results ) { // padded in psc_WG_SIZE to avoid bank conflicts // index psc_K in first dimension used for psc_carry storage LOCAL_MEM psc_scan_type psc_ldata[psc_K + 1][psc_WG_SIZE + 1]; const psc_index_type psc_interval_begin = psc_interval_size * psc_GID_0; const psc_index_type psc_interval_end = min(psc_interval_begin + psc_interval_size, N); const psc_index_type psc_unit_size = psc_K * psc_WG_SIZE; psc_index_type psc_unit_base = psc_interval_begin; for(; psc_unit_base + psc_unit_size <= psc_interval_end; psc_unit_base += psc_unit_size) { // {{{ read a unit's worth of data from psc_global for(psc_index_type psc_k = 0; psc_k < psc_K; psc_k++) { const psc_index_type psc_offset = psc_k*psc_WG_SIZE + psc_LID_0; const psc_index_type psc_read_i = psc_unit_base + psc_offset; { psc_scan_type psc_scan_value = psc_INPUT_EXPR(psc_read_i); const psc_index_type psc_o_mod_k = psc_offset % psc_K; const psc_index_type psc_o_div_k = psc_offset / psc_K; psc_ldata[psc_o_mod_k][psc_offset / psc_K] = psc_scan_value; } } // }}} // {{{ psc_carry in from previous unit, if applicable if (psc_LID_0 == 0 && psc_unit_base != psc_interval_begin) { psc_ldata[0][0] = psc_SCAN_EXPR(psc_ldata[psc_K][psc_WG_SIZE - 1], psc_ldata[0][0], false ); } // }}} local_barrier(); // {{{ scan along psc_k (sequentially in each work item) psc_scan_type psc_sum = psc_ldata[0][psc_LID_0]; for(psc_index_type psc_k = 1; psc_k < psc_K; psc_k++) { { psc_scan_type psc_tmp = psc_ldata[psc_k][psc_LID_0]; psc_index_type psc_seq_i = psc_unit_base + psc_K*psc_LID_0 + psc_k; psc_sum = psc_SCAN_EXPR(psc_sum, psc_tmp, false ); psc_ldata[psc_k][psc_LID_0] = psc_sum; } } // }}} // store psc_carry in out-of-bounds (padding) array entry (index psc_K) in the psc_K direction psc_ldata[psc_K][psc_LID_0] = psc_sum; local_barrier(); } if (psc_unit_base < psc_interval_end) { // {{{ psc_carry out input_fetch_exprs // (if there are ones that need to be fetched into local) // }}} // {{{ read a unit's worth of data from psc_global for(psc_index_type psc_k = 0; psc_k < psc_K; psc_k++) { const psc_index_type psc_offset = psc_k*psc_WG_SIZE + psc_LID_0; const psc_index_type psc_read_i = psc_unit_base + psc_offset; if (psc_read_i < psc_interval_end) { psc_scan_type psc_scan_value = psc_INPUT_EXPR(psc_read_i); const psc_index_type psc_o_mod_k = psc_offset % psc_K; const psc_index_type psc_o_div_k = psc_offset / psc_K; psc_ldata[psc_o_mod_k][psc_offset / psc_K] = psc_scan_value; } } // }}} // {{{ psc_carry in from previous unit, if applicable if (psc_LID_0 == 0 && psc_unit_base != psc_interval_begin) { psc_ldata[0][0] = psc_SCAN_EXPR(psc_ldata[psc_K][psc_WG_SIZE - 1], psc_ldata[0][0], false ); } // }}} local_barrier(); // {{{ scan along psc_k (sequentially in each work item) psc_scan_type psc_sum = psc_ldata[0][psc_LID_0]; const psc_index_type psc_offset_end = psc_interval_end - psc_unit_base; for(psc_index_type psc_k = 1; psc_k < psc_K; psc_k++) { if (psc_K * psc_LID_0 + psc_k < psc_offset_end) { psc_scan_type psc_tmp = psc_ldata[psc_k][psc_LID_0]; psc_index_type psc_seq_i = psc_unit_base + psc_K*psc_LID_0 + psc_k; psc_sum = psc_SCAN_EXPR(psc_sum, psc_tmp, false ); psc_ldata[psc_k][psc_LID_0] = psc_sum; } } // }}} // store psc_carry in out-of-bounds (padding) array entry (index psc_K) in the psc_K direction psc_ldata[psc_K][psc_LID_0] = psc_sum; local_barrier(); } // write interval psc_sum if (psc_LID_0 == 0) { psc_interval_results[psc_GID_0] = psc_partial_scan_buffer[psc_interval_end - 1]; } } """ cl.Program(ctx, src).build() # vim: filetype=pyopencl