Software Tuning, Performance Optimization & Platform Monitoring
Discussion regarding monitoring and software tuning methodologies, Performance Monitoring Unit (PMU) of Intel microprocessors, and platform updating.
1685 Discussions

Reading and Writing MSRs to count uops issued on Intel Ivy Bridge


Here is a Kernel module written to read and write the msr for Intel processors. The header file msrdrv.h is:

#include <linux/ioctl.h>
 #include <linux/types.h>
 #define DEV_NAME "msrdrv"
 #define DEV_MAJOR 223
 #define DEV_MINOR 0
 #define MSR_VEC_LIMIT 32

/*
 * Operation codes stored in MsrInOut.op.
 *
 * NOTE(review): MSR_WRITE and MSR_RDTSC are used by msrdrv.c but were missing
 * from this listing; the values 2 and 4 are assumed (2 fills the gap between
 * MSR_READ and MSR_STOP) -- confirm against the original header.
 */
enum MsrOperation {
    MSR_NOP   = 0,
    MSR_READ  = 1,
    MSR_WRITE = 2,
    MSR_STOP  = 3,
    MSR_RDTSC = 4
};

/*
 * One MSR command: the operation to perform, the MSR it targets, and the
 * 64-bit payload, which can be accessed either as two 32-bit halves
 * (munion.mstruct.eax / munion.mstruct.edx) or as one quad word
 * (munion.value).
 */
struct MsrInOut {
    unsigned int op;              // MsrOperation
    unsigned int ecx;             // msr identifier
    union msr_union {
        struct msr_struct {
            unsigned int eax;     // low double word
            unsigned int edx;     // high double word
        } mstruct;
        unsigned long long value; // quad word
    } munion;
};

The module file msrdrv.c:

#include <linux/module.h>      // for all modules 
#include <linux/init.h>        // for entry/exit macros 
#include <linux/kernel.h>      // for printk priority macros 
#include "msrdrv.h"

/*
 * Read the model-specific register selected by ecx with the RDMSR
 * instruction and log the result.
 *
 * ecx: MSR identifier.
 * Returns the 64-bit register content (EDX in the high half, EAX in the
 * low half).
 */
static long long read_msr(unsigned int ecx) {
    unsigned int edx = 0, eax = 0;
    unsigned long long result = 0;

    __asm__ __volatile__("rdmsr" : "=a"(eax), "=d"(edx) : "c"(ecx));
    /* Widen edx before shifting so the high half is not lost. */
    result = eax | ((unsigned long long)edx << 0x20);
    printk(KERN_ALERT "Module msrdrv: Read 0x%016llx (0x%08x:0x%08x) from MSR 0x%08x\n", result, edx, eax, ecx);
    return result;
}

/*
 * Log and then write a 64-bit value to the model-specific register selected
 * by ecx with the WRMSR instruction.
 *
 * ecx: MSR identifier.
 * eax: low double word of the value.
 * edx: high double word of the value.
 */
static void write_msr(int ecx, unsigned int eax, unsigned int edx) {
    printk(KERN_ALERT "Module msrdrv: Writing 0x%08x:0x%08x to MSR 0x%04x\n", edx, eax, ecx);
    __asm__ __volatile__("wrmsr" : : "c"(ecx), "a"(eax), "d"(edx));
}

static long long read_tsc(void)
unsigned eax, edx;
long long result;
__asm__ __volatile__("rdtsc" : "=a"(eax), "=d"(edx));
result = eax | (unsigned long long)edx << 0x20;
printk(KERN_ALERT "Module msrdrv: Read 0x%016llx (0x%08x:0x%08x) from TSC\n", result, edx, eax);
return result;

static long msrdrv_test(unsigned long long ioctl_param, int n)
struct MsrInOut *msrops;
int i;
msrops = (struct MsrInOut*)ioctl_param;
for (i = 0 ; i <= n ; i++) {
    switch (msrops.op) {
    case MSR_NOP:
    case MSR_STOP:
        goto label_end;
    case MSR_READ:
        msrops.munion.value = read_msr(msrops.ecx);
    case MSR_WRITE:
        write_msr(msrops.ecx, msrops.munion.mstruct.eax, msrops.munion.mstruct.edx);
    case MSR_RDTSC:
        msrops.munion.value = read_tsc();
        return 1;

return 0;

static int msrdrv_init(void)
int i=0;

struct MsrInOut msr_start[] = {
    { MSR_WRITE, 0x38F, 0x00, 0x00 },       // ia32_perf_global_ctrl: disable 4 PMCs & 3 FFCs
    { MSR_WRITE, 0xc1, 0x00, 0x00 },        // ia32_pmc0: zero value (35-5)
    { MSR_WRITE, 0xc2, 0x00, 0x00 },        // ia32_pmc1: zero value (35-5)
    { MSR_WRITE, 0xc3, 0x00, 0x00 },        // ia32_pmc2: zero value (35-5)
    { MSR_WRITE, 0xc4, 0x00, 0x00 },        // ia32_pmc3: zero value (35-5)
{ MSR_WRITE, 0x186, 0x004101c2, 0x00 }, // ia32_perfevtsel1, UOPS_RETIRED.ALL (19-28)
    { MSR_WRITE, 0x187, 0x0041010e, 0x00 }, // ia32_perfevtsel0, UOPS_ISSUED.ANY (19.22)
    { MSR_WRITE, 0x188, 0x01c1010e, 0x00 }, // ia32_perfevtsel2, UOPS_ISSUED.ANY-stalls (19-22)
    { MSR_WRITE, 0x189, 0x004101a2, 0x00 }, // ia32_perfevtsel3, RESOURCE_STALLS.ANY (19-27)
    { MSR_WRITE, 0x38F, 0x0f, 0x07 },       // ia32_perf_global_ctrl: enable 4 PMCs & 3 FFCs
    { MSR_STOP, 0x00, 0x00 }

struct MsrInOut msr_stop[] = {
    { MSR_WRITE, 0x38F, 0x00, 0x00 },       // ia32_perf_global_ctrl: disable 4 PMCs & 3 FFCs
    { MSR_READ, 0xc1, 0x00 ,0x00},               // ia32_pmc0: read value (35-5)
    { MSR_READ, 0xc2, 0x00 ,0x00},               // ia32_pmc1: read value (35-5)
    { MSR_READ, 0xc3, 0x00 ,0x00},               // ia32_pmc2: read value (35-5)
    { MSR_READ, 0xc4, 0x00 , 0x00 },               // ia32_pmc3: read value (35-5)
    { MSR_STOP, 0x00, 0x00 }

printk(KERN_ALERT "Module " DEV_NAME " loaded\n");
msrdrv_test((unsigned long long)msr_start,11);

printk("HELLO This is a hex number 0x%x\n", -1);
msrdrv_test((unsigned long long)msr_stop,6);
printk("uops retired:    %7lld\n", msr_stop[1].munion.value);
printk("uops issued:     %7lld\n", msr_stop[2].munion.value);
printk("stalled cycles:  %7lld\n", msr_stop[3].munion.value);
printk("resource stalls: %7lld\n", msr_stop[4].munion.value);
return 0;

static void msrdrv_exit(void)
printk(KERN_ALERT "Module " DEV_NAME " unloaded\n");


The output should be the values of the counters.

The output is:  uops retired : 0, uops issued : 0, stalled cycles: 15248, resource stalls : 0.

The module is giving the output for the event stalled cycles only (counter c3). It is giving 0 values for the other three events. Why??

Please help me work out why the module reads a non-zero value only for the stalled-cycles event and not for any of the other events.


0 Kudos
2 Replies
Black Belt

It is probably easier to test the functionality using the "rdmsr.c" and "wrmsr.c" command-line programs from msrtools-1.2.  That way you can be sure that you have programmed the control registers correctly without the extra effort (and somewhat increased risk) of debugging a kernel module.

Although your code is fairly short, I don't see that you have taken any steps to ensure that the kernel process remains bound to one core.  The Linux kernel MSR driver (arch/x86/kernel/msr.c) uses the function "rdmsr_safe_on_cpu" (see arch/x86/include/asm/msr.h and arch/x86/lib/msr-smp.c) to ensure that the MSR is always read on the specific core targeted by the /dev/cpu/*/msr device driver.

In any case, as part of the debugging process you should have the kernel print out the values of MSR_PERF_GLOBAL_CTRL (MSR 0x38F), and IA32_PERFEVTSEL[0-3] (MSRs 0x186-0x189).

The "rdtscp" instruction can also be used to check which core the kernel process is running on at any point.  The rdtscp instruction returns the contents of the IA32_TSC_AUX MSR (MSR 0xC0000103) in the %ecx register.  Linux sets up the IA32_TSC_AUX MSR (0xC0000103) separately on each core so that the bottom 12 bits contain the core number.

0 Kudos
Black Belt

Ooops -- it looks like you are configuring the counters to increment in user mode only, so I would not expect them to be incremented while you remain in the kernel device driver.  You will need to set bit 17 of the performance counter event select register to count in kernel mode, so (for example), 0x004101C2 should be changed to 0x004301C2, etc.

Of course, the easiest way to get these values for the whole program is to use "perf stat", which definitely knows how to set up all the control registers.

To see how "perf stat" has configured the registers I usually do something like:

taskset -c 1 perf stat -er004301c2 cat >/dev/null &                 # bind to core 1 and stall waiting on standard input

rdmsr -p 1 -x -0 0x38f

rdmsr -p 1 -x -0 0x186

rdmsr -p 1 -x -0 0x187

rdmsr -p 1 -x -0 0x188

rdmsr -p 1 -x -0 0x189

The kernel will disable the counters when the task is not running, but it will do this by clearing bit 22 of the PERFEVTSEL registers and leaving most of the rest of the bits alone.  On my system it sets bit 20 (enabling interrupts) when it clears bit 22, but certainly leaves the event select and umask bits alone so I can immediately see which of the counters it is using for the event I requested.

0 Kudos