KNL & first touch memory policy

aidan_c_ · ‎03-05-2018

Hi - I've been playing around with libnuma recently, exploring the options available and seeing the costs/benefits of various features.

When using the KNL in SNC4, I've been finding some unexpected behaviour about which NUMA node each memory page is placed upon.

I wrote a test program (which I'll place at the end of this post) that allocates a large page-aligned array and loops over it across all threads in page-sized chunks. This should spread the array over the NUMA nodes according to the NUMA placement of each thread, which is what happens when I run on Skylake:

Thread NUMA allocation: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Page distribution:
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
.......

However, when I run on the KNL machine I have access to, I instead get this

Thread NUMA allocation:
0 0 1 1 2 2 1 1 3 3 2 2 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 1 1 3 3
2 2 0 0 1 1 3 3 0 0 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 3 3 2 2 0 0

Page distribution:
0 0 1 1 2 2 1 1 3 3 2 2 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 1 1 3 3
2 2 0 0 1 1 3 3 0 0 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 3 3 2 2 0 0
0 0 1 1 2 2 1 1 3 3 2 2 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 1 1 3 3
2 2 0 0 1 1 3 3 0 0 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 3 3 2 2 0 0
0 0 1 1 2 2 1 1 3 3 2 2 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 1 1 3 3
2 2 0 0 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
....(a bunch of 1s, then )
1 1 1 1 1 1 1 3 0 0 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 3 3 2 2 0 0
0 0 1 1 2 2 1 1 3 3 2 2 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 1 1 3 3
2 2 0 0 1 1 3 3 0 0 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 3 3 2 2 0 0
0 0 1 1 2 2 1 1 3 3 2 2 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 1 1 3 3
2 2 0 0 1 1 3 3 0 0 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 3 3 2 2 0 0
0 0 1 1 2 2 1 1 3 3 2 2 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 1 1 3 3
2 2 0 0 1 1 3 3 0 0 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 3 3 2 2 0 0
0 0 1 1 2 2 1 1 3 3 2 2 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 1 1 3 3
2 2 0 0 1 1 3 3 0 0 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 3 3 2 2 0 0
0 0 1 1 2 2 1 1 3 3 2 2 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 1 1 3 3
2 2 0 0 1 1 3 3 0 0 1 1 3 3 2 2 0 0 1 1 3 3 2 2 0 0 3 3 2 2 0 0

From printing inside the loop, I'm told Thread 0 accesses page 256 (as expected), yet this page is still assigned to NUMA node 1 (whereas thread 0 is bound to NUMA node 0). I'm waiting for the same code to run on another KNL machine at the moment - but it appears that for some reason the pages in the middle are either touched by something else (unclear what that would be) or are ignoring the first touch memory policy.

The code was run with: KMP_AFFINITY=verbose,granularity=core,physical OMP_NUM_THREADS=64 on KNL (64 core) and the same but 32 threads on skylake (2x16 core skylake gold).

Test code (I know the OpenMP/Pthread hack is not a good idea but it works here, the code this is for is a pthread code):

#define _GNU_SOURCE


#include <numa.h>
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <asm/errno.h>
#include <pthread.h>
#include <omp.h>
#include <sched.h>

#define ARRAYSIZE 1024
/*
 * First-touch NUMA placement test.
 *
 * Allocates a page-aligned array of ARRAYSIZE pages, has the OpenMP threads
 * touch it one page-sized chunk at a time, then asks the kernel which NUMA
 * node each page ended up on and finally tries to migrate the pages so that
 * they alternate between nodes 0 and 1.
 *
 * Returns -1 if libnuma reports NUMA as unavailable, EXIT_FAILURE on an
 * allocation or move_pages failure, and 0 otherwise.
 */
int main(void){

  if(numa_available() == -1){
    fprintf(stderr, "NUMA is not available on this system\n");
    return -1;
  }

  const int num_numa_nodes = numa_num_configured_nodes();

  struct bitmask *allowed = numa_get_mems_allowed();
  printf("Allowed nodes:\n");
  for(int i = 0; i < num_numa_nodes; i++){
    printf("Node %i: %i\n", i, numa_bitmask_isbitset(allowed, i));
  }
  numa_bitmask_free(allowed);

  /* Report the membind mask itself (the original printed the allowed mask twice). */
  struct bitmask *membind = numa_get_membind();
  printf("Memory bound nodes:\n");
  for(int i = 0; i < num_numa_nodes; i++){
    printf("Node %i: %i\n", i, numa_bitmask_isbitset(membind, i));
  }
  numa_bitmask_free(membind);

  printf("Nodes = %i\n", num_numa_nodes);

  const int page_size_bytes = numa_pagesize();
  /* Number of ints per page; also used as the OpenMP chunk size below. */
  const int page_size = page_size_bytes / (int)sizeof(int);
  const int size = page_size * ARRAYSIZE;

  int numas[ARRAYSIZE];
  int newloc[ARRAYSIZE];
  int exit_code = EXIT_FAILURE;
  void **page_list = NULL;
  int *array = NULL;

  /*
   * NOTE(review): the buffer is only aligned to the base page size. If
   * transparent huge pages are enabled, a 2MB-aligned part of it may be
   * backed by a single huge page, which would be placed as a whole on the
   * node of whichever thread touches it first -- TODO confirm whether this
   * explains long runs of pages on one node.
   */
  void *raw = NULL;
  const int alloc_rc = posix_memalign(&raw, (size_t)page_size_bytes,
                                      (size_t)size * sizeof *array);
  if(alloc_rc != 0){
    fprintf(stderr, "posix_memalign failed (error %i)\n", alloc_rc);
    goto cleanup;
  }
  array = raw;

#pragma omp parallel
  {
    const int num_threads = omp_get_num_threads();
    const int me = omp_get_thread_num();

    cpu_set_t affinity;
    CPU_ZERO(&affinity);
    pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), &affinity);

    /* Print one NUMA node id per thread, in thread order, 32 per line. */
    for(int j = 0; j < num_threads; j++){
      if(j == me){
        for(int pin = 0; pin < CPU_SETSIZE; pin++){
          if(CPU_ISSET(pin, &affinity)){
            /* Report the node of the first CPU in this thread's affinity mask. */
            printf("%i ", numa_node_of_cpu(pin));
            break;
          }
        }
        if(j % 32 == 31) printf("\n");
      }
      #pragma omp barrier
    }

    /* First touch: each chunk of page_size ints (one page) is written by a single thread. */
#pragma omp for schedule(static, page_size)
    for(int i = 0; i < size; i++){
      if(i % page_size == 0) printf("Thread %i on page %i\n", me, i / page_size);
      array[i] = i;
    }
  }
  printf("\n");

  page_list = malloc(sizeof *page_list * ARRAYSIZE);
  if(page_list == NULL){
    perror("malloc");
    goto cleanup;
  }

  /* The array is page aligned, so page j starts exactly j pages past the base. */
  for(int j = 0; j < ARRAYSIZE; j++){
    page_list[j] = (char *)array + (size_t)j * (size_t)page_size_bytes;
  }

  /* -2 marks entries the kernel never filled in. */
  for(int i = 0; i < ARRAYSIZE; i++){
    numas[i] = -2;
  }

  /* With nodes == NULL nothing is moved; the current node of each page is written to numas. */
  int rc = numa_move_pages(0, (unsigned long)ARRAYSIZE, page_list, NULL, numas, MPOL_MF_MOVE);
  if(rc < 0){
    perror("numa_move_pages (query)");
    goto cleanup;
  }

  printf("Initial locations:\n");
  for(int i = 0; i < ARRAYSIZE; i++){
    printf("%i ", numas[i]);
    if(i % 32 == 31){
      printf("\n");
    }
  }
  printf("\n");

  /* Request that the pages alternate between node 0 and node 1. */
  for(int i = 0; i < ARRAYSIZE; i++){
    newloc[i] = i & 1;
  }

  rc = numa_move_pages(0, (unsigned long)ARRAYSIZE, page_list, newloc, numas, MPOL_MF_MOVE);
  if(rc < 0){
    perror("numa_move_pages (move)");
    goto cleanup;
  }
  if(rc > 0){
    /* Newer kernels may return the number of pages that could not be migrated. */
    fprintf(stderr, "numa_move_pages: %i pages were not moved\n", rc);
  }

  printf("Final location:\n");
  printf("%i \n", numas[0]);
  printf("%i \n", numas[1]);

  exit_code = EXIT_SUCCESS;

cleanup:
  free(page_list);
  free(array);
  return exit_code;
}