* Avi Kivity (avi@redhat.com) wrote:

Yep. A quick test (shown below) gives the cost of a TLB miss on the Intel Xeon E5404:

Number of cycles added over test baseline:

tlb and cache hit:          12.42
tlb hit, l2 hit, l1 miss:   17.88
tlb hit, l2+l1 miss:        32.34
tlb and cache miss:        449.58

So it's closer to 500 cycles per TLB miss.

Also, your analysis does not seem to correctly represent the reality of the TLB thrashing cost. On a workload that walks over a large number of random pages all the time (e.g. a large hash table), eating just a few more TLB entries will impact the number of misses over the entire workload. So it's not so much the misses that we see at the tracing site that are the problem, but also the extra misses taken by the application because of the extra pressure on the TLB. So just a few more TLB entries taken by the tracer will likely hurt these workloads.

The performance hit is not taken if the scheduler schedules another thread with the same mapping, only when it schedules a different process.

Depending on the tracer design, the average event size can range from 12 bytes (lttng is very aggressive in event size compaction) to about 40 bytes (perf); so for this you are mostly right. However, as explained above, the TLB miss cost is higher than you expected.

I tested it in the past, and must admit that I changed from a vmalloc-based implementation to a page-based one using software cross-page write primitives, based on feedback from Steven and Ingo. Diminishing TLB thrashing seemed like a good approach, and using vmalloc on 32-bit machines is a pain, because users have to tweak the vmalloc region size at boot. So all in all, I moved to a vmalloc-less implementation without much more thought.

If you feel we should test the performance of both approaches, we could do it in the generic ring buffer library (it allows both types of allocation backends). However, we'd have to find the right type of TLB-thrashing real-world workload to have meaningful results. This might be the hardest part.
Thanks, Mathieu # tlbmiss.c #include <sys/time.h> #include <time.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> typedef unsigned long long cycles_t; #define barrier() __asm__ __volatile__("": : :"memory") /* * Serialize core instruction execution. Also acts as a compiler barrier. * On PIC ebx cannot be clobbered */ #ifdef __PIC__ #define sync_core() \ asm volatile("push %%ebx; cpuid; pop %%ebx" \ : : : "memory", "eax", "ecx", "edx"); #endif #ifndef __PIC__ #define sync_core() \ asm volatile("cpuid" : : : "memory", "eax", "ebx", "ecx", "edx"); #endif #define mb() asm volatile("mfence":::"memory") #define smp_mb() mb() #define rdtsc(low,high) \ __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high)) #define rdtscl(low) \ __asm__ __volatile__("rdtsc" : "=a" (low) : : "edx") #define rdtscll(val) \ __asm__ __volatile__("rdtsc" : "=A" (val)) #define mb() asm volatile("mfence":::"memory") static inline cycles_t get_cycles_sync(void) { unsigned long long ret = 0; smp_mb(); sync_core(); rdtscll(ret); sync_core(); smp_mb(); return ret; } #define PAGE_SIZE 4096ULL /* 4k */ #define L1_CACHELINE_SIZE 64 #define L2_CACHELINE_SIZE 128 #define ARRAY_SIZE 262144ULL /* 1 GB */ static char testpage[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); static unsigned int idx[ARRAY_SIZE]; #define NR_TESTS 100 int main(int argc, char **argv) { struct timeval tv; struct timezone tz; cycles_t time1, time2; double cycles_per_iter; unsigned int i, j; pid_t pid; char *array; double baseline; printf("number of tests : %lu\n", NR_TESTS); srandom(get_cycles_sync()); array = malloc(sizeof(char) * ARRAY_SIZE * PAGE_SIZE); for (i=0; i<ARRAY_SIZE; i++) idx[i] = random() % ARRAY_SIZE; testpage[0] = 1; printf("Nothing (baseline)\n"); cycles_per_iter = 0.0; for (i=0; i<NR_TESTS; i++) { for (j=0; j<ARRAY_SIZE; j++) array[idx[j] * PAGE_SIZE] = 1; testpage[0] = 1; time1 = get_cycles_sync(); time2 = get_cycles_sync(); cycles_per_iter += (time2 - time1); } cycles_per_iter 
/= (double)NR_TESTS; baseline = (double) cycles_per_iter; printf("Baseline takes %g cycles\n", baseline); printf("TLB and caches hit\n"); cycles_per_iter = 0.0; for (i=0; i<NR_TESTS; i++) { for (j=0; j<ARRAY_SIZE; j++) array[idx[j] * PAGE_SIZE] = 1; testpage[0] = 1; time1 = get_cycles_sync(); testpage[0] = 1; time2 = get_cycles_sync(); cycles_per_iter += (time2 - time1); } cycles_per_iter /= (double)NR_TESTS; printf("tlb and cache hit %g cycles (adds %g)\n", (double) cycles_per_iter, (double) cycles_per_iter - baseline); printf("TLB hit, l2 cache hit, l1 cache miss\n"); cycles_per_iter = 0.0; for (i=0; i<NR_TESTS; i++) { for (j=0; j<ARRAY_SIZE; j++) array[idx[j] * PAGE_SIZE] = 1; testpage[0] = 1; time1 = get_cycles_sync(); testpage[L1_CACHELINE_SIZE] = 1; time2 = get_cycles_sync(); cycles_per_iter += (time2 - time1); } cycles_per_iter /= (double)NR_TESTS; printf("tlb hit, l2 hit, l1 miss %g cycles (adds %g)\n", (double) cycles_per_iter, (double) cycles_per_iter - baseline); printf("TLB hit, l2 cache miss, l1 cache miss\n"); cycles_per_iter = 0.0; for (i=0; i<NR_TESTS; i++) { for (j=0; j<ARRAY_SIZE; j++) array[idx[j] * PAGE_SIZE] = 1; testpage[0] = 1; time1 = get_cycles_sync(); testpage[L2_CACHELINE_SIZE] = 1; time2 = get_cycles_sync(); cycles_per_iter += (time2 - time1); } cycles_per_iter /= (double)NR_TESTS; printf("tlb hit,l2+l1 miss %g cycles (adds %g)\n", (double) cycles_per_iter, (double) cycles_per_iter - baseline); printf("TLB and cache miss\n"); cycles_per_iter = 0.0; for (i=0; i<NR_TESTS; i++) { for (j=0; j<ARRAY_SIZE; j++) array[idx[j] * PAGE_SIZE] = 1; time1 = get_cycles_sync(); testpage[0] = 1; time2 = get_cycles_sync(); cycles_per_iter += (time2 - time1); } cycles_per_iter /= (double)NR_TESTS; printf("tlb and cache miss %g cycles (adds %g)\n", (double) cycles_per_iter, (double) cycles_per_iter - baseline); free(array); return 0; } -- Mathieu Desnoyers Operating System Efficiency R&D Consultant EfficiOS Inc. http://www.efficios.com --
| Jesse Barnes | Re: [stable] [BUG][PATCH] cpqphp: fix kernel NULL pointer dereference |
| Greg KH | [003/136] p54usb: add Zcomax XG-705A usbid |
| Magnus Damm | [PATCH 03/07] ARM: Use shared GIC entry macros on Realview |
| Oliver Neukum | Re: [Bug #13682] The webcam stopped working when upgrading from 2.6.29 to 2.6.30 |
| Martin Schwidefsky | Re: [PATCH] optimized ktime_get[_ts] for GENERIC_TIME=y |
git: | |
| Junio C Hamano | Re: Some advanced index playing |
| Jeff King | Re: confusion over the new branch and merge config |
| Robin Rosenberg | Re: cvs2svn conversion directly to git ready for experimentation |
| Linus Torvalds | git binary size... |
| Ævar Arnfjörð Bjarmason | Re: Challenge with Git-Bash |
| Linux Kernel Mailing List | md: move allocation of ->queue from mddev_find to md_probe |
| Linux Kernel Mailing List | md: raid0: Represent zone->zone_offset in sectors. |
| Linux Kernel Mailing List | [ARM] S3C24XX: Add gpio_to_irq() facility |
| Linux Kernel Mailing List |
