The this_cpu_* options can be used to optimize __percpu_counter_add a bit. Avoids
some address arithmetic and saves 12 bytes.
Before:
00000000000001d3 <__percpu_counter_add>:
1d3: 55 push %rbp
1d4: 48 89 e5 mov %rsp,%rbp
1d7: 41 55 push %r13
1d9: 41 54 push %r12
1db: 53 push %rbx
1dc: 48 89 fb mov %rdi,%rbx
1df: 48 83 ec 08 sub $0x8,%rsp
1e3: 4c 8b 67 30 mov 0x30(%rdi),%r12
1e7: 65 4c 03 24 25 00 00 add %gs:0x0,%r12
1ee: 00 00
1f0: 4d 63 2c 24 movslq (%r12),%r13
1f4: 48 63 c2 movslq %edx,%rax
1f7: 49 01 f5 add %rsi,%r13
1fa: 49 39 c5 cmp %rax,%r13
1fd: 7d 0a jge 209 <__percpu_counter_add+0x36>
1ff: f7 da neg %edx
201: 48 63 d2 movslq %edx,%rdx
204: 49 39 d5 cmp %rdx,%r13
207: 7f 1e jg 227 <__percpu_counter_add+0x54>
209: 48 89 df mov %rbx,%rdi
20c: e8 00 00 00 00 callq 211 <__percpu_counter_add+0x3e>
211: 4c 01 6b 18 add %r13,0x18(%rbx)
215: 48 89 df mov %rbx,%rdi
218: 41 c7 04 24 00 00 00 movl $0x0,(%r12)
21f: 00
220: e8 00 00 00 00 callq 225 <__percpu_counter_add+0x52>
225: eb 04 jmp 22b <__percpu_counter_add+0x58>
227: 45 89 2c 24 mov %r13d,(%r12)
22b: 5b pop %rbx
22c: 5b pop %rbx
22d: 41 5c pop %r12
22f: 41 5d pop %r13
231: c9 leaveq
232: c3 retq
After:
00000000000001d3 <__percpu_counter_add>:
1d3: 55 push %rbp
1d4: 48 63 ca movslq %edx,%rcx
1d7: 48 89 e5 mov %rsp,%rbp
1da: 41 54 push %r12
1dc: 53 push %rbx
1dd: 48 89 fb ...