使用时间戳计数器获取时间戳

Question

我使用下面的代码来获取处理器的时钟周期

/*
 * Read the processor's time stamp counter with the RDTSC instruction.
 *
 * The instruction delivers the counter as two 32-bit halves (the "a"
 * and "d" register outputs); they are joined into one 64-bit value.
 */
unsigned long long rdtsc(void)
{
  unsigned low_half, high_half;
  unsigned long long ticks;

  __asm__ __volatile__ ("rdtsc" : "=a"(low_half), "=d"(high_half));

  /* High half goes into bits 32..63, low half into bits 0..31. */
  ticks = high_half;
  ticks <<= 32;
  ticks |= low_half;
  return ticks;
}

我得到一些值，比如 43，但是这里的单位是什么？是微秒还是纳秒

我使用下面的代码来获取我的电路板的频率。

cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_cur_freq
1700000

我还使用下面的代码来查找我的处理器速度

dmidecode -t processor | grep "Speed"
Max Speed: 3700 MHz
Current Speed: 3700 MHz

现在如何使用上述频率并将其转换为微秒或毫秒？

Answer 1

对于所提的问题“如何把 TSC 频率换算成微秒或毫秒？”（how do I convert the TSC frequency to microseconds or milliseconds?），简单的回答是：不要这样做。TSC（时间戳计数器）的实际时钟频率因硬件而异，并且在运行期间可能发生变化。要测量实际经过的时间，请在 Linux 中使用 clock_gettime(CLOCK_REALTIME) 或 clock_gettime(CLOCK_MONOTONIC)。

正如 Peter Cordes 在评论（2018 年 8 月）中提到的那样，在大多数当前的 x86-64 架构上，时间戳计数器（通过 RDTSC 指令和 <x86intrin.h> 中声明的 __rdtsc() 函数访问）计数参考时钟周期，而不是 CPU 时钟周期。他的 answer to a similar question in C++ 在 x86-64 上的 Linux 中也对 C 有效，因为编译器在编译 C 或 C++ 时提供了底层内置，其余答案涉及硬件细节。我也推荐阅读那个。

此答案的其余部分假定潜在问题是微基准测试代码，以找出某些函数的两个实现如何相互比较。

在 x86（Intel 32 位）和 x86-64（AMD64、Intel 和 AMD 64 位）架构上，您可以使用 <x86intrin.h> 中的 __rdtsc() 来得知经过了多少个 TSC 时钟周期。这可用于测量和比较某个函数的不同实现所使用的周期数，通常需要重复测量很多次。

请注意，TSC 时钟与 CPU 时钟之间的关系因硬件而异。上面提到的那个较新的答案对此做了较详细的说明。就 Linux 下的实际用途而言，使用 cpufreq-set 禁用频率缩放就足够了（以确保 CPU 频率和 TSC 频率之间的关系在微基准测试期间不会改变），还可以选择使用 taskset 将微基准测试限制在特定的 CPU 核心上。这样可以确保该微基准测试中收集到的结果可以相互比较。

（正如 Peter Cordes 评论的那样，我们还需要加上 <emmintrin.h> 中的 _mm_lfence()（<immintrin.h> 也包含了它）。这可以确保 CPU 不会在内部把 RDTSC 操作与被基准测试的函数重新排序。如果需要，您可以在编译时使用 -DNO_LFENCE 来省略这些操作。）

假设您有要比较的函数 void foo(void); 和 void bar(void);：

#include <stdlib.h>
#include <x86intrin.h>
#include <stdio.h>

/* Select the implementation of lfence().
   By default it expands to _mm_lfence() from <emmintrin.h>.
   Compiling with -DNO_LFENCE makes it expand to nothing, so every
   lfence(); statement becomes an empty statement. */
#ifdef    NO_LFENCE
#define   lfence()
#else
#include <emmintrin.h>
#define   lfence()  _mm_lfence()
#endif

/*
 * qsort() comparison callback for unsigned long long elements.
 * Yields -1, 0 or +1 when the first element is smaller than,
 * equal to, or larger than the second one, respectively.
 */
static int cmp_ull(const void *aptr, const void *bptr)
{
    const unsigned long long *lhs = aptr;
    const unsigned long long *rhs = bptr;

    if (*lhs < *rhs)
        return -1;
    if (*lhs > *rhs)
        return +1;
    return 0;
}

/*
 * Call func() count times and measure every call in Time Stamp Counter
 * cycles, as reported by __rdtsc().
 *
 * Returns a dynamically allocated array of count + 2 elements that the
 * caller must free():
 *   [0 .. count-1]  per-call cycle counts, sorted in ascending order
 *   [count]         cycle count of the very first call (0 if count is 0)
 *   [count + 1]     terminating zero
 * Returns NULL if the array cannot be allocated, or if its size in
 * bytes would not fit in a size_t.
 */
unsigned long long *measure_cycles(size_t count, void (*func)(void))
{
    unsigned long long  *elapsed, started, finished;
    size_t               i;

    /* Refuse counts for which (count + 2) * sizeof elapsed[0] would wrap
       around and silently allocate a too-small array. */
    if (count > (size_t)-1 / sizeof elapsed[0] - 2)
        return NULL;

    elapsed = malloc((count + 2) * sizeof elapsed[0]);
    if (!elapsed)
        return NULL;

    /* Call func() count times, measuring the TSC cycles for each call. */
    for (i = 0; i < count; i++) {
        /* First, let's ensure our CPU executes everything thus far. */
        lfence();
        /* Start timing. */
        started = __rdtsc();
        /* Ensure timing starts before we call the function. */
        lfence();
        /* Call the function. */
        func();
        /* Ensure everything has been executed thus far. */
        lfence();
        /* Stop timing. */
        finished = __rdtsc();
        /* Ensure we have the counter value before proceeding. */
        lfence();

        elapsed[i] = finished - started;
    }

    /* The very first call is likely the cold-cache case,
       so in case that measurement might contain useful
       information, we put it at the end of the array.
       With count == 0 there is no first call, and elapsed[0]
       was never written, so store zero instead of reading it.
       We also terminate the array with a zero. */
    elapsed[count] = (count > 0) ? elapsed[0] : 0;
    elapsed[count + 1] = 0;

    /* Sort the cycle counts; only the first count entries take part,
       so the two trailing entries keep their positions. */
    qsort(elapsed, count, sizeof elapsed[0], cmp_ull);

    /* This function returns all cycle counts, in sorted order,
       although the median, elapsed[count/2], is the one
       I personally use. */
    return elapsed;
}

/*
 * Measure foo() and bar() count times each and print, for both, the
 * median number of Time Stamp Counter cycles per call.
 *
 * Does nothing when count is zero.  If either measurement cannot be
 * obtained, an error is written to stderr and no results are printed.
 */
void benchmark(const size_t count)
{
    unsigned long long  *foo_cycles, *bar_cycles;

    if (count < 1)
        return;

    printf("Measuring run time in Time Stamp Counter cycles:\n");
    fflush(stdout);

    foo_cycles = measure_cycles(count, foo);
    bar_cycles = measure_cycles(count, bar);

    /* measure_cycles() returns NULL on failure; indexing the result
       without this check would dereference a null pointer. */
    if (!foo_cycles || !bar_cycles) {
        fprintf(stderr, "benchmark: cannot obtain %zu cycle measurements\n", count);
        free(bar_cycles);
        free(foo_cycles);
        return;
    }

    /* The arrays are sorted, so element count/2 is the median. */
    printf("foo(): %llu cycles (median of %zu calls)\n", foo_cycles[count/2], count);
    printf("bar(): %llu cycles (median of %zu calls)\n", bar_cycles[count/2], count);

    free(bar_cycles);
    free(foo_cycles);
}

请注意，以上结果与所使用的编译器和编译器选项密切相关，当然也与运行它的硬件有关。周期数的中位数可以解释为“通常花费的 TSC 周期数”（the typical number of TSC cycles taken），因为测量并不完全可靠（可能会受到进程外事件的影响；例如上下文切换，或者在某些 CPU 上迁移到另一个核心）。出于同样的原因，我不相信最小值、最大值或平均值。

不过，上面得到的两个实现（foo() 和 bar()）的周期数可以相互比较，从而了解它们在微基准测试中的相对性能。请记住，微基准测试结果未必能推广到实际工作任务，因为任务之间的资源使用交互非常复杂。例如，一个函数可能在所有微基准测试中都优于其他函数，但在现实世界中却表现更差，因为它只有在有大量 CPU 缓存可供使用时才高效。

在Linux中，一般来说，您可以使用CLOCK_REALTIME时钟来测量所用的实际时间（挂钟时间），方法与上述相同。 CLOCK_MONOTONIC 更好，因为它不受管理员可能对实时时钟所做的直接更改的影响（例如，如果他们注意到系统时钟提前或落后）；仅应用因 NTP 等引起的漂移调整。使用任何一个时钟，夏令时或其变化都不会影响测量。同样，我寻求的是多次测量的中值，因为被测量代码本身之外的事件会影响结果。

例如：

#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <stdio.h>
#include <time.h>

/* Select the implementation of lfence().
   By default it expands to _mm_lfence() from <emmintrin.h>.
   Compiling with -DNO_LFENCE makes it expand to nothing, so every
   lfence(); statement becomes an empty statement. */
#ifdef   NO_LFENCE
#define  lfence()
#else
#include <emmintrin.h>
#define  lfence() _mm_lfence()
#endif

/*
 * qsort() comparison callback for double elements.
 * Yields -1, 0 or +1 when the first element is smaller than,
 * equal to, or larger than the second one, respectively.
 * If neither ordering comparison holds, the result is 0.
 */
static int cmp_double(const void *aptr, const void *bptr)
{
    const double *lhs = aptr;
    const double *rhs = bptr;

    if (*lhs < *rhs)
        return -1;
    if (*lhs > *rhs)
        return +1;
    return 0;
}

/*
 * Call func() count times, timing every call with
 * clock_gettime(CLOCK_MONOTONIC), and return the median duration
 * of one call in seconds (element count/2 of the sorted durations).
 *
 * Returns -1.0 if count is zero, if the sample array cannot be
 * allocated (or its size would not fit in a size_t), or if reading
 * the clock fails.
 */
double median_seconds(const size_t count, void (*func)(void))
{
    struct timespec started, stopped;
    double         *seconds, median;
    size_t          i;
    int             start_rc, stop_rc;

    /* Without samples there is no median, and seconds[count / 2] would
       be read out of bounds.  Also refuse counts whose size in bytes
       would wrap around. */
    if (count < 1 || count > (size_t)-1 / sizeof seconds[0])
        return -1.0;

    seconds = malloc(count * sizeof seconds[0]);
    if (!seconds)
        return -1.0;

    for (i = 0; i < count; i++) {
        lfence();
        start_rc = clock_gettime(CLOCK_MONOTONIC, &started);
        lfence();
        func();
        lfence();
        stop_rc = clock_gettime(CLOCK_MONOTONIC, &stopped);
        lfence();

        /* Checked only after the timed region, so that the check
           itself is not part of the measurement. */
        if (start_rc != 0 || stop_rc != 0) {
            free(seconds);
            return -1.0;
        }

        seconds[i] = (double)(stopped.tv_sec - started.tv_sec)
                   + (double)(stopped.tv_nsec - started.tv_nsec) / 1000000000.0;
    }

    qsort(seconds, count, sizeof seconds[0], cmp_double);
    median = seconds[count / 2];
    free(seconds);
    return median;
}

/*
 * Report, in seconds, the resolution of the clock used for the wall
 * clock measurements.
 *
 * median_seconds() takes its timestamps from CLOCK_MONOTONIC, so the
 * resolution of that same clock is queried here; asking for
 * CLOCK_REALTIME would describe a clock the measurements never use.
 *
 * Returns 0.0 if the resolution cannot be determined.
 */
static double realtime_precision(void)
{
    struct timespec t;

    if (clock_getres(CLOCK_MONOTONIC, &t) != 0)
        return 0.0;

    return (double)t.tv_sec
         + (double)t.tv_nsec / 1000000000.0;
}

/*
 * Measure foo() and bar() count times each and print, for both, the
 * median wall clock time of one call in nanoseconds, followed by the
 * approximate measurement unit.
 *
 * Does nothing when count is zero.  If either measurement fails, an
 * error is written to stderr and no timings are printed.
 */
void benchmark(const size_t count)
{
    double median_foo, median_bar;
    if (count < 1)
        return;

    printf("Median wall clock times over %zu calls:\n", count);
    fflush(stdout);

    median_foo = median_seconds(count, foo);
    median_bar = median_seconds(count, bar);

    /* median_seconds() returns -1.0 on failure; printing that value
       would present the failure sentinel as a real timing. */
    if (median_foo < 0.0 || median_bar < 0.0) {
        fprintf(stderr, "benchmark: wall clock measurement failed\n");
        return;
    }

    /* Medians are in seconds; scale to nanoseconds for display. */
    printf("foo(): %.3f ns\n", median_foo * 1000000000.0);
    printf("bar(): %.3f ns\n", median_bar * 1000000000.0);

    printf("(Measurement unit is approximately %.3f ns)\n", 1000000000.0 * realtime_precision());
    fflush(stdout);
}

总的来说，我个人更喜欢把被测函数放在单独的编译单元中编译（生成单独的目标文件），并且还对一个什么都不做的函数进行基准测试，以估计函数调用开销（不过这往往会高估开销，也就是得到过大的开销估计值，因为函数调用开销中有一部分是延迟（latency）而不是实际花费的时间，而在实际函数中，这些延迟期间可以同时进行一些其他操作）。

重要的是要记住，以上测量值只能作为参考，因为在现实世界的应用程序中，缓存局部性之类的因素（尤其是在当前具有多级缓存和大量内存的机器上）会极大地影响不同实现所花费的时间。

例如，您可以比较快速排序和基数排序的速度。根据键的大小，基数排序需要相当大的额外数组（并使用大量缓存）。如果使用排序例程的实际应用程序不会同时使用大量其他内存（因此被排序的数据基本上就是缓存中的数据），那么只要数据量足够大（并且实现合理），基数排序就会更快。但是，如果应用程序是多线程的，并且其他线程在搬运（复制或传输）大量内存，那么使用大量缓存的基数排序会把其他已缓存的数据驱逐出去；即使基数排序函数本身没有表现出明显的减速，它也可能拖慢其他线程，从而拖慢整个程序，因为其他线程必须等待它们的数据被重新缓存。

这意味着您唯一应该信任的“基准测试”（benchmarks）是在实际硬件上、用实际工作数据运行实际工作任务时所做的挂钟时间测量。其他一切都取决于许多条件，或多或少都值得怀疑：可以作为参考，但并不十分可靠。

使用时间戳计数器获取时间戳

Using Time stamp counter to get the time stamp

x86

timer

linux-device-driver

linux-kernel