OSTEP:6 Mechanism:Limited Direct Execution Homework

bxz

2023-04-15

书里说的rdtsc（TSC是一个64位的寄存器，记录处理器从启动到现在的时钟周期数）把高32位放入edx，低32位放入eax，在多核处理器上好像已经不太准确，原因有：

CPU乱序执行之后，无法保证 rdtsc 指令的执行一定是在业务代码执行的之前和之后
处理器的变频
无法保证每个CPU核心的 TSC 寄存器是同步的

引入了常量速率TSC的特性解决了变频问题 cat /proc/cpuinfo | grep constant_tsc查看处理器是否支持，但是不能估计时间

对于乱序问题我们可以在rdtsc前后加cpu级的memory barrier，例如mfence（Intel手册推荐在rdtsc前使用lfence，或者mfence;lfence组合）

或者用rdtscp：它不是严格意义上的序列化指令，但会等待之前的所有指令都执行完毕后才读取TSC；需要注意的是，它之后的指令仍然可能在读取完成之前就开始执行，如果要阻止这一点，可以在rdtscp之后再加一条lfence。cat /proc/cpuinfo | grep rdtscp查看是否支持

#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/stat.h>
 #include <unistd.h>
#include <fcntl.h>
#include <assert.h>
/*
 * Estimate the cost of a minimal system call in TSC cycles.
 *
 * Reads the time-stamp counter with rdtscp, performs ITERATIONS zero-byte
 * read() calls on ./test.c, reads the counter again, and prints three
 * values, each divided by ITERATIONS: the start counter, the end counter,
 * and their difference (the average cycles per read() call).
 *
 * Returns 0 on success, 1 if the file cannot be opened or read() fails.
 */
int main(void) {
    enum { ITERATIONS = 1000000 };
    uint64_t low = 0, high = 0;

    /* Explicit check instead of assert(): assert disappears under NDEBUG. */
    int fd = open("./test.c", O_RDWR);
    if (fd == -1) {
        perror("open");
        return 1;
    }

    /*
     * rdtscp returns the counter in EDX:EAX and additionally writes
     * IA32_TSC_AUX into ECX, so rcx must be declared as clobbered.
     * The "memory" clobber keeps the compiler from moving memory accesses
     * across the timestamp. If plain rdtsc were used instead, an explicit
     * fence instruction would be needed around it to limit CPU reordering.
     */
    __asm__ volatile("rdtscp" : "=a"(low), "=d"(high) : : "rcx", "memory");
    uint64_t start = high << 32 | low;

    for (size_t i = 0; i < ITERATIONS; i++) {
        /* Zero-length read: enters the kernel but transfers no data. */
        if (read(fd, NULL, 0) < 0) {
            perror("read");
            close(fd);
            return 1;
        }
    }

    __asm__ volatile("rdtscp" : "=a"(low), "=d"(high) : : "rcx", "memory");
    uint64_t end = high << 32 | low;

    printf("%llu\n", (unsigned long long)(start / ITERATIONS));
    printf("%llu\n", (unsigned long long)(end / ITERATIONS));
    printf("%llu\n", (unsigned long long)((end - start) / ITERATIONS));
    puts("-----------");

    close(fd);
    return 0;
}

grxer@grxer ~/D/s/O/o/6> while true ;taskset -c 1 sudo ./test;end
35475179
35475567
387
-----------
35475595
35475995
399
-----------
35476026
35476421
395
-----------
35476449
35476848
399
-----------
35476881
35477275
393
-----------
35477304
35477695
390
-----------
^C⏎

如果想要一个准确时间可以用clock_gettime，会比gettimeofday准一点,gettimeofday和clock_gettime函数返回的是当前系统时钟的时间值，而不是指令的执行时间，所以没用barrier来防止乱序

#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <time.h>
#include <assert.h>
/*
 * Estimate the cost of a minimal system call using clock_gettime().
 *
 * Takes a CLOCK_MONOTONIC timestamp, performs ITERATIONS zero-byte read()
 * calls on ./test.c, takes a second timestamp, and prints the average time
 * per call in nanoseconds, microseconds and milliseconds.
 *
 * Returns 0 on success, 1 if open(), clock_gettime() or read() fails.
 */
int main(void) {
    enum { ITERATIONS = 1000000 };

    /* Explicit check instead of assert(): assert disappears under NDEBUG. */
    int fd = open("./test.c", O_RDWR);
    if (fd == -1) {
        perror("open");
        return 1;
    }

    /*
     * struct timespec {
     *     time_t tv_sec;   // seconds
     *     long   tv_nsec;  // nanoseconds
     * };
     */
    struct timespec start_time, end_time;

    if (clock_gettime(CLOCK_MONOTONIC, &start_time) != 0) {
        perror("clock_gettime");
        close(fd);
        return 1;
    }
    for (size_t i = 0; i < ITERATIONS; i++) {
        /* Zero-length read: enters the kernel but transfers no data. */
        if (read(fd, NULL, 0) < 0) {
            perror("read");
            close(fd);
            return 1;
        }
    }
    if (clock_gettime(CLOCK_MONOTONIC, &end_time) != 0) {
        perror("clock_gettime");
        close(fd);
        return 1;
    }

    /*
     * Subtract the integer fields first and only then convert to double,
     * so no precision is lost by scaling large absolute timestamps by 1e9.
     */
    double total_ns = (double)(end_time.tv_sec - start_time.tv_sec) * 1e9
                    + (double)(end_time.tv_nsec - start_time.tv_nsec);
    double elapsed_time = total_ns / ITERATIONS; /* average ns per call */

    printf("ns:%f\n", elapsed_time);
    printf("us:%f\n", elapsed_time / 1000.0);
    printf("ms:%f\n", elapsed_time / 1e6);

    close(fd);
    return 0;
}

grxer@grxer ~/D/s/O/o/6 [SIGINT]> gcc -g -o  test test.c
grxer@grxer ~/D/s/O/o/6> while true ;taskset -c 1 sudo ./test;end
ns:127.141074
us:0.127141
ms:0.000127
ns:127.567767
us:0.127568
ms:0.000128
ns:126.079839
us:0.126080
ms:0.000126
ns:127.143936
us:0.127144
ms:0.000127
ns:126.850415
us:0.126850
ms:0.000127
ns:121.985531
us:0.121986
ms:0.000122
ns:119.445469
us:0.119445
ms:0.000119
ns:119.942527
us:0.119943
ms:0.000120
^C⏎

两种方式比较一下，看了下cpu主频cpu MHz : 3193.924，那么一个时钟周期是 1 / 3193.924MHz ≈ 0.313094 ns，0.313094*395 ≈ 123.67ns

两种方式相差无几

需要测量上下文切换成本，

需要把程序限制到一个cpu上做切换，这次我们用sched_setaffinity来实现绑定指定处理器而不是用taskset(PS:简单测试了一下，fork出来的子进程会继承父进程的cpu亲和性掩码，也就是跑在同一个cpu上)

#define _GNU_SOURCE             /* See feature_test_macros(7) */
#include <sched.h>
int sched_setaffinity(pid_t pid, size_t cpusetsize,
                      const cpu_set_t *mask);
pid为0表示设置当前进程，否则设置pid指定的进程
cpusetsize一般给sizeof(cpu_set_t)
mask CPU位掩码 的操作
CPU_ZERO()：清除集合的内容，让其不包含任何CPU。

CPU_SET()：添加cpu到集合中。

CPU_CLR()：从集合中移除cpu

CPU_ISSET() ：测试cpu是否在集合中。

CPU_COUNT()：返回集合中包含的CPU数量。

#define _GNU_SOURCE        
#include <sched.h>
#include <errno.h>
#include <stdio.h>
#include <fcntl.h>
#include <assert.h>
#include <time.h>
#include <stdlib.h>   // exit
#include <sys/wait.h> // waitpid
#include <unistd.h>   // fork, pipe, close, write, dup2
/*
 * Estimate context-switch cost with a two-pipe ping-pong.
 *
 * The process pins itself to CPU TARGET_CPU with sched_setaffinity() and
 * then forks. The parent writes one byte to pipe 1 and blocks reading
 * pipe 2; the child blocks reading pipe 1 and answers on pipe 2. With both
 * processes on one CPU, each round trip should involve two context
 * switches (parent -> child -> parent) plus two write() and two read()
 * calls. The parent prints the average time per round trip in ns, us, ms.
 *
 * Exits with EXIT_FAILURE if any system call fails.
 */
int main(void) {
    enum { ROUND_TRIPS = 1000000, TARGET_CPU = 3 };

    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(TARGET_CPU, &mask);
    if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
        perror("sched_setaffinity");
        exit(EXIT_FAILURE);
    }

    int pipefd1[2], pipefd2[2]; /* pipe 1: parent -> child, pipe 2: child -> parent */
    if (pipe(pipefd1) < 0 || pipe(pipefd2) < 0) {
        perror("pipe");
        exit(EXIT_FAILURE);
    }

    pid_t rc = fork();
    if (rc < 0) {
        perror("fork");
        exit(EXIT_FAILURE);
    }

    char buf[2];

    if (rc == 0) {
        /*
         * Child: close the ends it does not use, so that a dead peer shows
         * up as EOF or SIGPIPE instead of blocking forever.
         */
        close(pipefd1[1]);
        close(pipefd2[0]);
        for (size_t i = 0; i < ROUND_TRIPS; i++) {
            if (read(pipefd1[0], buf, 1) != 1 ||
                write(pipefd2[1], "a", 1) != 1) {
                fprintf(stderr, "child: pipe I/O failed\n");
                exit(EXIT_FAILURE);
            }
        }
        exit(EXIT_SUCCESS);
    }

    /* Parent: close the ends it does not use, for the same reason. */
    close(pipefd1[0]);
    close(pipefd2[1]);

    struct timespec start_time, end_time;
    if (clock_gettime(CLOCK_MONOTONIC, &start_time) != 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
    for (size_t i = 0; i < ROUND_TRIPS; i++) {
        if (write(pipefd1[1], "0", 1) != 1 ||
            read(pipefd2[0], buf, 1) != 1) {
            fprintf(stderr, "parent: pipe I/O failed\n");
            exit(EXIT_FAILURE);
        }
    }
    if (clock_gettime(CLOCK_MONOTONIC, &end_time) != 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }

    /* Reap the child only after the second timestamp, outside the timed region. */
    if (waitpid(rc, NULL, 0) < 0) {
        perror("waitpid");
    }

    /*
     * Subtract the integer fields first and only then convert to double,
     * so no precision is lost by scaling large absolute timestamps by 1e9.
     */
    double total_ns = (double)(end_time.tv_sec - start_time.tv_sec) * 1e9
                    + (double)(end_time.tv_nsec - start_time.tv_nsec);
    double elapsed_time = total_ns / ROUND_TRIPS; /* average ns per round trip */

    printf("ns:%f\n", elapsed_time);
    printf("us:%f\n", elapsed_time / 1000.0);
    printf("ms:%f\n", elapsed_time / 1e6);

    return 0;
}

查看一下cpu affinity，父子进程的掩码都是8(二进制1000)，也就是我们用CPU_SET(3, &mask)设置的3号cpu（cpu从0开始编号，即第四个cpu）

grxer@grxer ~> ps -a
    PID TTY          TIME CMD
   2677 pts/12   00:00:04 fish
   8612 tty2     00:00:00 gnome-session-b
   9251 pts/13   00:00:00 tmux: client
   9276 pts/14   00:00:00 fish
   9316 pts/15   00:00:00 fish
  36681 pts/14   00:00:01 gdb
  36697 pts/14   00:00:00 test <defunct>
  48284 pts/23   00:00:00 fish
  50253 pts/24   00:00:00 fish
  54215 pts/38   00:00:02 2
  54216 pts/38   00:00:02 2
  54309 pts/24   00:00:00 ps
grxer@grxer ~> taskset -p 54216
pid 54216's current affinity mask: 8
grxer@grxer ~> taskset -p 54215
pid 54215's current affinity mask: 8

交替阻塞，交替上下文切换

grxer@grxer ~/D/s/O/o/6 [SIGINT]> gcc -g -o 2 2.c
grxer@grxer ~/D/s/O/o/6> ./2
ns:4238.650005
us:4.238650
ms:0.004239

这个时间是一次往返的耗时，其中包含两次上下文切换（父→子、子→父），另外还应该再减去两次写入、两次读取的时间，才能得到单次上下文切换的成本，懒得搞了