- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Here is a kernel module written to read and write the MSRs (model-specific registers) of Intel processors. The header file msrdrv.h is:
#ifndef MSRDRV_H
#define MSRDRV_H

/*
 * msrdrv.h - shared definitions for the msrdrv MSR read/write module.
 */

#include <linux/ioctl.h>
#include <linux/types.h>

#define DEV_NAME "msrdrv"
#define DEV_MAJOR 223
#define DEV_MINOR 0

#define MSR_VEC_LIMIT 32

/* Operation selector stored in MsrInOut.op. */
enum MsrOperation {
    MSR_NOP   = 0,
    MSR_READ  = 1,
    MSR_WRITE = 2,
    MSR_STOP  = 3,
    MSR_RDTSC = 4
};

/*
 * One entry of an MSR operation list.
 *
 * The payload is a union: the same 64 bits can be accessed either as the
 * eax/edx register pair or as a single quad word.
 */
struct MsrInOut {
    unsigned int op;                  // MsrOperation
    unsigned int ecx;                 // msr identifier
    union msr_union {
        struct msr_struct {
            unsigned int eax;         // low double word
            unsigned int edx;         // high double word
        } mstruct;
        unsigned long long value;     // quad word
    } munion;
};

#endif /* MSRDRV_H */
The module file msrdrv.c:
#include <linux/module.h> // for all modules #include <linux/init.h> // for entry/exit macros #include <linux/kernel.h> // for printk priority macros #include "msrdrv.h" static long long read_msr(unsigned int ecx) { unsigned int edx = 0, eax = 0; unsigned long long result = 0; __asm__ __volatile__("rdmsr" : "=a"(eax), "=d"(edx) : "c"(ecx)); result = eax | (unsigned long long)edx << 0x20; printk(KERN_ALERT "Module msrdrv: Read 0x%016llx (0x%08x:0x%08x) from MSR 0x%08x\n", result, edx, eax, ecx); return result; } static void write_msr(int ecx, unsigned int eax, unsigned int edx) { printk(KERN_ALERT "Module msrdrv: Writing 0x%08x:0x%08x to MSR 0x%04x\n", edx, eax, ecx); __asm__ __volatile__("wrmsr" : : "c"(ecx), "a"(eax), "d"(edx)); } static long long read_tsc(void) { unsigned eax, edx; long long result; __asm__ __volatile__("rdtsc" : "=a"(eax), "=d"(edx)); result = eax | (unsigned long long)edx << 0x20; printk(KERN_ALERT "Module msrdrv: Read 0x%016llx (0x%08x:0x%08x) from TSC\n", result, edx, eax); return result; } static long msrdrv_test(unsigned long long ioctl_param, int n) { struct MsrInOut *msrops; int i; msrops = (struct MsrInOut*)ioctl_param; for (i = 0 ; i <= n ; i++) { switch (msrops.op) { case MSR_NOP: break; case MSR_STOP: goto label_end; case MSR_READ: msrops.munion.value = read_msr(msrops.ecx); break; case MSR_WRITE: write_msr(msrops.ecx, msrops.munion.mstruct.eax, msrops.munion.mstruct.edx); break; case MSR_RDTSC: msrops.munion.value = read_tsc(); break; default: return 1; } } label_end: return 0; } static int msrdrv_init(void) { int i=0; struct MsrInOut msr_start[] = { { MSR_WRITE, 0x38F, 0x00, 0x00 }, // ia32_perf_global_ctrl: disable 4 PMCs & 3 FFCs { MSR_WRITE, 0xc1, 0x00, 0x00 }, // ia32_pmc0: zero value (35-5) { MSR_WRITE, 0xc2, 0x00, 0x00 }, // ia32_pmc1: zero value (35-5) { MSR_WRITE, 0xc3, 0x00, 0x00 }, // ia32_pmc2: zero value (35-5) { MSR_WRITE, 0xc4, 0x00, 0x00 }, // ia32_pmc3: zero value (35-5) { MSR_WRITE, 0x186, 0x004101c2, 0x00 }, // 
ia32_perfevtsel1, UOPS_RETIRED.ALL (19-28) { MSR_WRITE, 0x187, 0x0041010e, 0x00 }, // ia32_perfevtsel0, UOPS_ISSUED.ANY (19.22) { MSR_WRITE, 0x188, 0x01c1010e, 0x00 }, // ia32_perfevtsel2, UOPS_ISSUED.ANY-stalls (19-22) { MSR_WRITE, 0x189, 0x004101a2, 0x00 }, // ia32_perfevtsel3, RESOURCE_STALLS.ANY (19-27) { MSR_WRITE, 0x38F, 0x0f, 0x07 }, // ia32_perf_global_ctrl: enable 4 PMCs & 3 FFCs { MSR_STOP, 0x00, 0x00 } }; struct MsrInOut msr_stop[] = { { MSR_WRITE, 0x38F, 0x00, 0x00 }, // ia32_perf_global_ctrl: disable 4 PMCs & 3 FFCs { MSR_READ, 0xc1, 0x00 ,0x00}, // ia32_pmc0: read value (35-5) { MSR_READ, 0xc2, 0x00 ,0x00}, // ia32_pmc1: read value (35-5) { MSR_READ, 0xc3, 0x00 ,0x00}, // ia32_pmc2: read value (35-5) { MSR_READ, 0xc4, 0x00 , 0x00 }, // ia32_pmc3: read value (35-5) { MSR_STOP, 0x00, 0x00 } }; printk(KERN_ALERT "Module " DEV_NAME " loaded\n"); msrdrv_test((unsigned long long)msr_start,11); printk("HELLO This is a hex number 0x%x\n", -1); while(i<10000){ i++; } msrdrv_test((unsigned long long)msr_stop,6); printk("uops retired: %7lld\n", msr_stop[1].munion.value); printk("uops issued: %7lld\n", msr_stop[2].munion.value); printk("stalled cycles: %7lld\n", msr_stop[3].munion.value); printk("resource stalls: %7lld\n", msr_stop[4].munion.value); return 0; } static void msrdrv_exit(void) { printk(KERN_ALERT "Module " DEV_NAME " unloaded\n"); return; } module_init(msrdrv_init); module_exit(msrdrv_exit);
The output should be the values of the counters.
The output is: uops retired : 0, uops issued : 0, stalled cycles: 15248, resource stalls : 0
.
The module reports a value only for the stalled-cycles event (counter 0xc3); it reports 0 for the other three events. Why?
Please help me understand why it reads only the stalled-cycles counter and none of the other events.
Link Copied
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
It is probably easier to test the functionality using the "rdmsr.c" and "wrmsr.c" command-line programs from msrtools-1.2. That way you can be sure that you have programmed the control registers correctly without the extra effort (and somewhat increased risk) of debugging a kernel module.
Although your code is fairly short, I don't see that you have taken any steps to ensure that the kernel process remains bound to one core. The Linux kernel MSR driver (arch/x86/kernel/msr.c) uses the function "rdmsr_safe_on_cpu" (see arch/x86/include/asm/msr.h and arch/x86/lib/msr-smp.c) to ensure that the MSR is always read on the specific core targeted by the /dev/cpu/*/msr device driver.
In any case, as part of the debugging process you should have the kernel print out the values of MSR_PERF_GLOBAL_CTRL (MSR 0x38F), and IA32_PERFEVTSEL[0-3] (MSRs 0x186-0x189).
The "rdtscp" instruction can also be used to check which core the kernel process is running on at any point. The rdtscp instruction returns the contents of the IA32_TSC_AUX MSR (MSR 0xC0000103) in the %ecx register. Linux sets up the IA32_TSC_AUX MSR (0xC0000103) separately on each core so that the bottom 12 bits contain the core number.
- Mark as New
- Bookmark
- Subscribe
- Mute
- Subscribe to RSS Feed
- Permalink
- Report Inappropriate Content
Ooops -- it looks like you are configuring the counters to increment in user mode only, so I would not expect them to be incremented while you remain in the kernel device driver. You will need to set bit 17 of the performance counter event select register to count in kernel mode, so (for example), 0x004101C2 should be changed to 0x004301C2, etc.
Of course, the easiest way to get these values for the whole program is to use "perf stat", which definitely knows how to set up all the control registers.
To see how "perf stat" has configured the registers I usually do something like:
taskset -c 1 perf stat -er004301c2 cat >/dev/null & # bind to core 1 and stall waiting on standard input
rdmsr -p 1 -x -0 0x38f
rdmsr -p 1 -x -0 0x186
rdmsr -p 1 -x -0 0x187
rdmsr -p 1 -x -0 0x188
rdmsr -p 1 -x -0 0x189
The kernel will disable the counters when the task is not running, but it will do this by clearing bit 22 of the PERFEVTSEL registers and leaving most of the rest of the bits alone. On my system it sets bit 20 (enabling interrupts) when it clears bit 22, but certainly leaves the event select and umask bits alone so I can immediately see which of the counters it is using for the event I requested.
- Subscribe to RSS Feed
- Mark Topic as New
- Mark Topic as Read
- Float this Topic for Current User
- Bookmark
- Subscribe
- Printer Friendly Page