diff --git a/libcpuid/libcpuid.h b/libcpuid/libcpuid.h index 4f98a3d..19311c1 100644 --- a/libcpuid/libcpuid.h +++ b/libcpuid/libcpuid.h @@ -678,6 +678,15 @@ int cpu_clock_measure(int millis, int quad_check); * * Recommended values - millis = 50, runs = 4. For more robustness, * increase the number of runs. + * + * NOTE: on Bulldozer and later CPUs, the busy-wait cycle runs at 1.4 IPC, thus + * the results are skewed. This is corrected internally by dividing the resulting + * value by 1.4. + * However, the 1.4 IPC rate is only reached if the thread is executed alone on a + * single CMT module - if there are other threads competing for resources, the results are + * unpredictable. Make sure you run cpu_clock_by_ic() on a CPU that is free from + * competing threads, or if there are such threads, they shouldn't exceed the + * number of modules. On a Bulldozer X8, that means 4 threads. * * @returns the CPU clock frequency in MHz (within some measurement error * margin). If SSE is not supported, the result is -1. 
If the input parameters diff --git a/libcpuid/rdtsc.c b/libcpuid/rdtsc.c index a202940..a13d983 100644 --- a/libcpuid/rdtsc.c +++ b/libcpuid/rdtsc.c @@ -231,18 +231,28 @@ int cpu_clock_by_ic(int millis, int runs) int max_value = 0, cur_value, i, ri, cycles_inner, cycles_outer, c; struct cpu_id_t* id; uint64_t t0, t1, tl, hz; - int sse_multiplier = 1; + int multiplier_numerator = 1, multiplier_denom = 1; if (millis <= 0 || runs <= 0) return -2; id = get_cached_cpuid(); + // if there aren't SSE instructions - we can't run the test at all if (!id || !id->flags[CPU_FEATURE_SSE]) return -1; // if (id->sse_size < 128) { debugf(1, "SSE execution path is 64-bit\n"); - sse_multiplier = 2; + // on a CPU with half SSE unit length, SSE instructions execute at 0.5 IPC; + // the resulting value must be multiplied by 2: + multiplier_numerator = 2; } else { debugf(1, "SSE execution path is 128-bit\n"); } // + // on a Bulldozer or later CPU, SSE instructions execute at 1.4 IPC, handle that as well: + if (id->vendor == VENDOR_AMD && id->ext_family >= 21) { + debugf(1, "cpu_clock_by_ic: Bulldozer (or later) detected, dividing result by 1.4\n"); + multiplier_numerator = 5; + multiplier_denom = 7; // multiply by 5/7, to divide by 1.4 + } + // tl = millis * 125; // (*1000 / 8) cycles_inner = 128; cycles_outer = 1; @@ -267,8 +277,8 @@ int cpu_clock_by_ic(int millis, int runs) // cpu_Hz = cycles_inner * cycles_outer * 256 / (t1 - t0) * 1000000 debugf(2, "c = %d, td = %llu\n", c, t1 - t0); hz = ((uint64_t) cycles_inner * (uint64_t) 256 + 12) * - (uint64_t) cycles_outer * (uint64_t) sse_multiplier * (uint64_t) c * (uint64_t) 1000000 - / (t1 - t0); + (uint64_t) cycles_outer * (uint64_t) multiplier_numerator * (uint64_t) c * (uint64_t) 1000000 + / ((t1 - t0) * (uint64_t) multiplier_denom); cur_value = (int) (hz / 1000000); if (cur_value > max_value) max_value = cur_value; }