perf.py 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344
  1. #!/usr/bin/env python3
  2. #
  3. # Script to aggregate and report Linux perf results.
  4. #
  5. # Example:
  6. # ./scripts/perf.py -R -obench.perf ./runners/bench_runner
  7. # ./scripts/perf.py bench.perf -j -Flfs.c -Flfs_util.c -Scycles
  8. #
  9. # Copyright (c) 2022, The littlefs authors.
  10. # SPDX-License-Identifier: BSD-3-Clause
  11. #
import bisect
import collections as co
import csv
import errno
import fcntl
import functools as ft
import itertools as it
import math as m
import multiprocessing as mp
import os
import re
import shlex
import shutil
import subprocess as sp
import sys
import tempfile
import zipfile
# TODO support non-zip perf results?

# default tool paths and knobs, overridable from the command line
PERF_PATH = ['perf']  # perf invocation, may contain extra args
PERF_EVENTS = 'cycles,branch-misses,branches,cache-misses,cache-references'
PERF_FREQ = 100  # default perf sampling frequency
OBJDUMP_PATH = ['objdump']  # objdump invocation, may contain extra args
THRESHOLD = (0.5, 0.85)  # default (show, highlight) annotate thresholds
  34. # integer fields
  35. class Int(co.namedtuple('Int', 'x')):
  36. __slots__ = ()
  37. def __new__(cls, x=0):
  38. if isinstance(x, Int):
  39. return x
  40. if isinstance(x, str):
  41. try:
  42. x = int(x, 0)
  43. except ValueError:
  44. # also accept +-∞ and +-inf
  45. if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
  46. x = m.inf
  47. elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
  48. x = -m.inf
  49. else:
  50. raise
  51. assert isinstance(x, int) or m.isinf(x), x
  52. return super().__new__(cls, x)
  53. def __str__(self):
  54. if self.x == m.inf:
  55. return '∞'
  56. elif self.x == -m.inf:
  57. return '-∞'
  58. else:
  59. return str(self.x)
  60. def __int__(self):
  61. assert not m.isinf(self.x)
  62. return self.x
  63. def __float__(self):
  64. return float(self.x)
  65. none = '%7s' % '-'
  66. def table(self):
  67. return '%7s' % (self,)
  68. diff_none = '%7s' % '-'
  69. diff_table = table
  70. def diff_diff(self, other):
  71. new = self.x if self else 0
  72. old = other.x if other else 0
  73. diff = new - old
  74. if diff == +m.inf:
  75. return '%7s' % '+∞'
  76. elif diff == -m.inf:
  77. return '%7s' % '-∞'
  78. else:
  79. return '%+7d' % diff
  80. def ratio(self, other):
  81. new = self.x if self else 0
  82. old = other.x if other else 0
  83. if m.isinf(new) and m.isinf(old):
  84. return 0.0
  85. elif m.isinf(new):
  86. return +m.inf
  87. elif m.isinf(old):
  88. return -m.inf
  89. elif not old and not new:
  90. return 0.0
  91. elif not old:
  92. return 1.0
  93. else:
  94. return (new-old) / old
  95. def __add__(self, other):
  96. return self.__class__(self.x + other.x)
  97. def __sub__(self, other):
  98. return self.__class__(self.x - other.x)
  99. def __mul__(self, other):
  100. return self.__class__(self.x * other.x)
  101. # perf results
  102. class PerfResult(co.namedtuple('PerfResult', [
  103. 'file', 'function', 'line',
  104. 'cycles', 'bmisses', 'branches', 'cmisses', 'caches',
  105. 'children'])):
  106. _by = ['file', 'function', 'line']
  107. _fields = ['cycles', 'bmisses', 'branches', 'cmisses', 'caches']
  108. _sort = ['cycles', 'bmisses', 'cmisses', 'branches', 'caches']
  109. _types = {
  110. 'cycles': Int,
  111. 'bmisses': Int, 'branches': Int,
  112. 'cmisses': Int, 'caches': Int}
  113. __slots__ = ()
  114. def __new__(cls, file='', function='', line=0,
  115. cycles=0, bmisses=0, branches=0, cmisses=0, caches=0,
  116. children=[]):
  117. return super().__new__(cls, file, function, int(Int(line)),
  118. Int(cycles), Int(bmisses), Int(branches), Int(cmisses), Int(caches),
  119. children)
  120. def __add__(self, other):
  121. return PerfResult(self.file, self.function, self.line,
  122. self.cycles + other.cycles,
  123. self.bmisses + other.bmisses,
  124. self.branches + other.branches,
  125. self.cmisses + other.cmisses,
  126. self.caches + other.caches,
  127. self.children + other.children)
  128. def openio(path, mode='r', buffering=-1):
  129. # allow '-' for stdin/stdout
  130. if path == '-':
  131. if mode == 'r':
  132. return os.fdopen(os.dup(sys.stdin.fileno()), mode, buffering)
  133. else:
  134. return os.fdopen(os.dup(sys.stdout.fileno()), mode, buffering)
  135. else:
  136. return open(path, mode, buffering)
# run perf as a subprocess, storing measurements into a zip file
def record(command, *,
        output=None,
        perf_freq=PERF_FREQ,
        perf_period=None,
        perf_events=PERF_EVENTS,
        perf_path=PERF_PATH,
        **args):
    """Run command under perf record, appending the raw perf data to
    the zip file at output.

    The zip file collects perf data from multiple runs, one
    'perf.<pid>' entry per run; access is serialized with an exclusive
    flock so parallel runners don't corrupt it.

    Returns the command's exit code, or errno.EOWNERDEAD on
    KeyboardInterrupt.
    """
    # create a temporary file for perf to write to, as far as I can tell
    # this is strictly needed because perf's pipe-mode only works with stdout
    with tempfile.NamedTemporaryFile('rb') as f:
        # figure out our perf invocation, note perf_path may contain
        # extra args
        perf = perf_path + list(filter(None, [
            'record',
            # -F (frequency) and -c (period) are alternatives, an
            # explicit period takes precedence
            '-F%s' % perf_freq
                if perf_freq is not None
                and perf_period is None else None,
            '-c%s' % perf_period
                if perf_period is not None else None,
            '-B',
            '-g',
            '--all-user',
            '-e%s' % perf_events,
            '-o%s' % f.name]))
        # run our command
        try:
            if args.get('verbose'):
                print(' '.join(shlex.quote(c) for c in perf + command))
            err = sp.call(perf + command, close_fds=False)
        except KeyboardInterrupt:
            # report the interrupt via an error code (EOWNERDEAD is
            # presumably an arbitrary-but-distinct choice)
            err = errno.EOWNERDEAD
        # synchronize access to the shared zip file
        z = os.open(output, os.O_RDWR | os.O_CREAT)
        fcntl.flock(z, fcntl.LOCK_EX)
        # copy measurements into our zip file
        with os.fdopen(z, 'r+b') as z:
            with zipfile.ZipFile(z, 'a',
                    compression=zipfile.ZIP_DEFLATED,
                    compresslevel=1) as z:
                with z.open('perf.%d' % os.getpid(), 'w') as g:
                    shutil.copyfileobj(f, g)
    # forward the return code
    return err
  180. # try to only process each dso onceS
  181. #
  182. # note this only caches with the non-keyword arguments
  183. def multiprocessing_cache(f):
  184. local_cache = {}
  185. manager = mp.Manager()
  186. global_cache = manager.dict()
  187. lock = mp.Lock()
  188. def multiprocessing_cache(*args, **kwargs):
  189. # check local cache?
  190. if args in local_cache:
  191. return local_cache[args]
  192. # check global cache?
  193. with lock:
  194. if args in global_cache:
  195. v = global_cache[args]
  196. local_cache[args] = v
  197. return v
  198. # fall back to calling the function
  199. v = f(*args, **kwargs)
  200. global_cache[args] = v
  201. local_cache[args] = v
  202. return v
  203. return multiprocessing_cache
  204. @multiprocessing_cache
  205. def collect_syms_and_lines(obj_path, *,
  206. objdump_path=None,
  207. **args):
  208. symbol_pattern = re.compile(
  209. '^(?P<addr>[0-9a-fA-F]+)'
  210. '\s+.*'
  211. '\s+(?P<size>[0-9a-fA-F]+)'
  212. '\s+(?P<name>[^\s]+)\s*$')
  213. line_pattern = re.compile(
  214. '^\s+(?:'
  215. # matches dir/file table
  216. '(?P<no>[0-9]+)'
  217. '(?:\s+(?P<dir>[0-9]+))?'
  218. '\s+.*'
  219. '\s+(?P<path>[^\s]+)'
  220. # matches line opcodes
  221. '|' '\[[^\]]*\]\s+'
  222. '(?:'
  223. '(?P<op_special>Special)'
  224. '|' '(?P<op_copy>Copy)'
  225. '|' '(?P<op_end>End of Sequence)'
  226. '|' 'File .*?to (?:entry )?(?P<op_file>\d+)'
  227. '|' 'Line .*?to (?P<op_line>[0-9]+)'
  228. '|' '(?:Address|PC) .*?to (?P<op_addr>[0x0-9a-fA-F]+)'
  229. '|' '.' ')*'
  230. ')$', re.IGNORECASE)
  231. # figure out symbol addresses and file+line ranges
  232. syms = {}
  233. sym_at = []
  234. cmd = objdump_path + ['-t', obj_path]
  235. if args.get('verbose'):
  236. print(' '.join(shlex.quote(c) for c in cmd))
  237. proc = sp.Popen(cmd,
  238. stdout=sp.PIPE,
  239. stderr=sp.PIPE if not args.get('verbose') else None,
  240. universal_newlines=True,
  241. errors='replace',
  242. close_fds=False)
  243. for line in proc.stdout:
  244. m = symbol_pattern.match(line)
  245. if m:
  246. name = m.group('name')
  247. addr = int(m.group('addr'), 16)
  248. size = int(m.group('size'), 16)
  249. # ignore zero-sized symbols
  250. if not size:
  251. continue
  252. # note multiple symbols can share a name
  253. if name not in syms:
  254. syms[name] = set()
  255. syms[name].add((addr, size))
  256. sym_at.append((addr, name, size))
  257. proc.wait()
  258. if proc.returncode != 0:
  259. if not args.get('verbose'):
  260. for line in proc.stderr:
  261. sys.stdout.write(line)
  262. # assume no debug-info on failure
  263. pass
  264. # sort and keep largest/first when duplicates
  265. sym_at.sort(key=lambda x: (x[0], -x[2], x[1]))
  266. sym_at_ = []
  267. for addr, name, size in sym_at:
  268. if len(sym_at_) == 0 or sym_at_[-1][0] != addr:
  269. sym_at_.append((addr, name, size))
  270. sym_at = sym_at_
  271. # state machine for dwarf line numbers, note that objdump's
  272. # decodedline seems to have issues with multiple dir/file
  273. # tables, which is why we need this
  274. lines = []
  275. line_at = []
  276. dirs = {}
  277. files = {}
  278. op_file = 1
  279. op_line = 1
  280. op_addr = 0
  281. cmd = objdump_path + ['--dwarf=rawline', obj_path]
  282. if args.get('verbose'):
  283. print(' '.join(shlex.quote(c) for c in cmd))
  284. proc = sp.Popen(cmd,
  285. stdout=sp.PIPE,
  286. stderr=sp.PIPE if not args.get('verbose') else None,
  287. universal_newlines=True,
  288. errors='replace',
  289. close_fds=False)
  290. for line in proc.stdout:
  291. m = line_pattern.match(line)
  292. if m:
  293. if m.group('no') and not m.group('dir'):
  294. # found a directory entry
  295. dirs[int(m.group('no'))] = m.group('path')
  296. elif m.group('no'):
  297. # found a file entry
  298. dir = int(m.group('dir'))
  299. if dir in dirs:
  300. files[int(m.group('no'))] = os.path.join(
  301. dirs[dir],
  302. m.group('path'))
  303. else:
  304. files[int(m.group('no'))] = m.group('path')
  305. else:
  306. # found a state machine update
  307. if m.group('op_file'):
  308. op_file = int(m.group('op_file'), 0)
  309. if m.group('op_line'):
  310. op_line = int(m.group('op_line'), 0)
  311. if m.group('op_addr'):
  312. op_addr = int(m.group('op_addr'), 0)
  313. if (m.group('op_special')
  314. or m.group('op_copy')
  315. or m.group('op_end')):
  316. file = os.path.abspath(files.get(op_file, '?'))
  317. lines.append((file, op_line, op_addr))
  318. line_at.append((op_addr, file, op_line))
  319. if m.group('op_end'):
  320. op_file = 1
  321. op_line = 1
  322. op_addr = 0
  323. proc.wait()
  324. if proc.returncode != 0:
  325. if not args.get('verbose'):
  326. for line in proc.stderr:
  327. sys.stdout.write(line)
  328. # assume no debug-info on failure
  329. pass
  330. # sort and keep first when duplicates
  331. lines.sort()
  332. lines_ = []
  333. for file, line, addr in lines:
  334. if len(lines_) == 0 or lines_[-1][0] != file or lines[-1][1] != line:
  335. lines_.append((file, line, addr))
  336. lines = lines_
  337. # sort and keep first when duplicates
  338. line_at.sort()
  339. line_at_ = []
  340. for addr, file, line in line_at:
  341. if len(line_at_) == 0 or line_at_[-1][0] != addr:
  342. line_at_.append((addr, file, line))
  343. line_at = line_at_
  344. return syms, sym_at, lines, line_at
def collect_decompressed(path, *,
        perf_path=PERF_PATH,
        sources=None,
        everything=False,
        propagate=0,
        depth=1,
        **args):
    """Parse one decompressed perf data file into a list of PerfResults.

    Runs perf script on path and walks the reported samples/callchains,
    mapping addresses back to file+line info via collect_syms_and_lines.

    - sources:    if given, only keep results from these source files
    - everything: keep internal/kernel symbols and out-of-cwd files
    - propagate:  if nonzero, stop recording a callchain past this depth
    - depth:      how many levels of callchain children to keep
    """
    sample_pattern = re.compile(
        '(?P<comm>\w+)'
        '\s+(?P<pid>\w+)'
        '\s+(?P<time>[\w.]+):'
        '\s*(?P<period>\w+)'
        '\s+(?P<event>[^:]+):')
    frame_pattern = re.compile(
        '\s+(?P<addr>\w+)'
        '\s+(?P<sym>[^\s\+]+)(?:\+(?P<off>\w+))?'
        '\s+\((?P<dso>[^\)]+)\)')
    # maps perf event names -> PerfResult field names
    events = {
        'cycles': 'cycles',
        'branch-misses': 'bmisses',
        'branches': 'branches',
        'cache-misses': 'cmisses',
        'cache-references': 'caches'}

    # note perf_path may contain extra args
    cmd = perf_path + [
        'script',
        '-i%s' % path]
    if args.get('verbose'):
        print(' '.join(shlex.quote(c) for c in cmd))
    proc = sp.Popen(cmd,
        stdout=sp.PIPE,
        stderr=sp.PIPE if not args.get('verbose') else None,
        universal_newlines=True,
        errors='replace',
        close_fds=False)

    # parser state for the sample currently being read
    last_filtered = False  # was the last sample line a tracked event?
    last_event = ''        # event name of the current sample
    last_period = 0        # sample period of the current sample
    last_stack = []        # callchain frames collected so far
    deltas = co.defaultdict(lambda: {})  # per-dso candidate ASLR deltas
    syms_ = co.defaultdict(lambda: {})   # per-dso previously seen symbols
    at_cache = {}          # (dso, addr) -> file+line lookups, or None to skip
    results = {}

    def commit():
        # tail-recursively propagate measurements up the callchain,
        # charging each frame and nesting callees under callers
        for i in range(len(last_stack)):
            results_ = results
            for j in reversed(range(i+1)):
                if i+1-j > depth:
                    break
                # propagate
                name = last_stack[j]
                if name not in results_:
                    results_[name] = (co.defaultdict(lambda: 0), {})
                results_[name][0][last_event] += last_period
                # recurse
                results_ = results_[name][1]

    for line in proc.stdout:
        # we need to process a lot of data, so wait to use regex as late
        # as possible
        if not line.startswith('\t'):
            # a non-indented line starts a new sample, commit the last one
            if last_filtered:
                commit()
            last_filtered = False
            if line:
                m = sample_pattern.match(line)
                if m and m.group('event') in events:
                    last_filtered = True
                    last_event = m.group('event')
                    last_period = int(m.group('period'), 0)
                    last_stack = []
        elif last_filtered:
            # indented lines are callchain frames of the current sample
            m = frame_pattern.match(line)
            if m:
                # filter out internal/kernel functions
                if not everything and (
                        m.group('sym').startswith('__')
                        or m.group('sym').startswith('0')
                        or m.group('sym').startswith('-')
                        or m.group('sym').startswith('[')
                        or m.group('dso').startswith('/usr/lib')):
                    continue

                dso = m.group('dso')
                sym = m.group('sym')
                off = int(m.group('off'), 0) if m.group('off') else 0
                addr_ = int(m.group('addr'), 16)

                # get the syms/lines for the dso, this is cached
                syms, sym_at, lines, line_at = collect_syms_and_lines(
                    dso,
                    **args)

                # ASLR is tricky, we have symbols+offsets, but static symbols
                # means we may have multiple options for each symbol.
                #
                # To try to solve this, we use previous seen symbols to build
                # confidence for the correct ASLR delta. This means we may
                # guess incorrectly for early symbols, but this will only affect
                # a few samples.
                if sym in syms:
                    sym_addr_ = addr_ - off
                    # track possible deltas?
                    for sym_addr, size in syms[sym]:
                        delta = sym_addr - sym_addr_
                        if delta not in deltas[dso]:
                            # score a new delta against all symbols seen so far
                            deltas[dso][delta] = sum(
                                abs(a_+delta - a)
                                for s, (a_, _) in syms_[dso].items()
                                for a, _ in syms[s])
                    # penalize every delta by its error for this symbol
                    for delta in deltas[dso].keys():
                        deltas[dso][delta] += abs(sym_addr_+delta - sym_addr)
                    syms_[dso][sym] = sym_addr_, size

                    # guess the best delta, lowest score wins
                    delta, _ = min(deltas[dso].items(),
                        key=lambda x: (x[1], x[0]))
                    addr = addr_ + delta

                    # cached?
                    if (dso,addr) in at_cache:
                        cached = at_cache[(dso,addr)]
                        if cached is None:
                            # cache says to skip
                            continue
                        file, line = cached
                    else:
                        # find file+line
                        i = bisect.bisect(line_at, addr, key=lambda x: x[0])
                        if i > 0:
                            _, file, line = line_at[i-1]
                        else:
                            file, line = re.sub('(\.o)?$', '.c', dso, 1), 0

                        # ignore filtered sources
                        if sources is not None:
                            if not any(
                                    os.path.abspath(file) == os.path.abspath(s)
                                    for s in sources):
                                at_cache[(dso,addr)] = None
                                continue
                        else:
                            # default to only cwd
                            if not everything and not os.path.commonpath([
                                    os.getcwd(),
                                    os.path.abspath(file)]) == os.getcwd():
                                at_cache[(dso,addr)] = None
                                continue

                        # simplify path
                        if os.path.commonpath([
                                os.getcwd(),
                                os.path.abspath(file)]) == os.getcwd():
                            file = os.path.relpath(file)
                        else:
                            file = os.path.abspath(file)

                        at_cache[(dso,addr)] = file, line
                else:
                    # no symbol info, fall back to a guessed source name
                    file, line = re.sub('(\.o)?$', '.c', dso, 1), 0

                last_stack.append((file, sym, line))

                # stop propagating?
                if propagate and len(last_stack) >= propagate:
                    commit()
                    last_filtered = False
    # commit the final sample
    if last_filtered:
        commit()
    proc.wait()
    if proc.returncode != 0:
        if not args.get('verbose'):
            for line in proc.stderr:
                sys.stdout.write(line)
        sys.exit(-1)

    # rearrange results into result type
    def to_results(results):
        results_ = []
        for name, (r, children) in results.items():
            results_.append(PerfResult(*name,
                **{events[k]: v for k, v in r.items()},
                children=to_results(children)))
        return results_

    return to_results(results)
  519. def collect_job(path, i, **args):
  520. # decompress into a temporary file, this is to work around
  521. # some limitations of perf
  522. with zipfile.ZipFile(path) as z:
  523. with z.open(i) as f:
  524. with tempfile.NamedTemporaryFile('wb') as g:
  525. shutil.copyfileobj(f, g)
  526. g.flush()
  527. return collect_decompressed(g.name, **args)
  528. def starapply(args):
  529. f, args, kwargs = args
  530. return f(*args, **kwargs)
  531. def collect(perf_paths, *,
  532. jobs=None,
  533. **args):
  534. # automatic job detection?
  535. if jobs == 0:
  536. jobs = len(os.sched_getaffinity(0))
  537. records = []
  538. for path in perf_paths:
  539. # each .perf file is actually a zip file containing perf files from
  540. # multiple runs
  541. with zipfile.ZipFile(path) as z:
  542. records.extend((path, i) for i in z.infolist())
  543. # we're dealing with a lot of data but also surprisingly
  544. # parallelizable
  545. if jobs is not None:
  546. results = []
  547. with mp.Pool(jobs) as p:
  548. for results_ in p.imap_unordered(
  549. starapply,
  550. ((collect_job, (path, i), args) for path, i in records)):
  551. results.extend(results_)
  552. else:
  553. results = []
  554. for path, i in records:
  555. results.extend(collect_job(path, i, **args))
  556. return results
  557. def fold(Result, results, *,
  558. by=None,
  559. defines=None,
  560. **_):
  561. if by is None:
  562. by = Result._by
  563. for k in it.chain(by or [], (k for k, _ in defines or [])):
  564. if k not in Result._by and k not in Result._fields:
  565. print("error: could not find field %r?" % k)
  566. sys.exit(-1)
  567. # filter by matching defines
  568. if defines is not None:
  569. results_ = []
  570. for r in results:
  571. if all(getattr(r, k) in vs for k, vs in defines):
  572. results_.append(r)
  573. results = results_
  574. # organize results into conflicts
  575. folding = co.OrderedDict()
  576. for r in results:
  577. name = tuple(getattr(r, k) for k in by)
  578. if name not in folding:
  579. folding[name] = []
  580. folding[name].append(r)
  581. # merge conflicts
  582. folded = []
  583. for name, rs in folding.items():
  584. folded.append(sum(rs[1:], start=rs[0]))
  585. # fold recursively
  586. folded_ = []
  587. for r in folded:
  588. folded_.append(r._replace(children=fold(
  589. Result, r.children,
  590. by=by,
  591. defines=defines)))
  592. folded = folded_
  593. return folded
def table(Result, results, diff_results=None, *,
        by=None,
        fields=None,
        sort=None,
        summary=False,
        all=False,
        percent=False,
        depth=1,
        **_):
    """Print results as a table, optionally diffed against diff_results.

    - by:      fields to name rows by, defaults to Result._by
    - fields:  measurement fields to show, defaults to Result._fields
    - sort:    list of (field, reverse) pairs to sort rows by
    - summary: only print the TOTAL row
    - all:     also show rows with no diff changes
    - percent: show only new values annotated with percent changes
    - depth:   how many levels of children to print as a tree
    """
    # the all parameter shadows the builtin all, recover the builtin
    # (note this assumes __builtins__ is a module, i.e. script context)
    all_, all = all, __builtins__.all

    if by is None:
        by = Result._by
    if fields is None:
        fields = Result._fields
    types = Result._types

    # fold again
    results = fold(Result, results, by=by)
    if diff_results is not None:
        diff_results = fold(Result, diff_results, by=by)

    # organize by name
    table = {
        ','.join(str(getattr(r, k) or '') for k in by): r
        for r in results}
    diff_table = {
        ','.join(str(getattr(r, k) or '') for k in by): r
        for r in diff_results or []}
    names = list(table.keys() | diff_table.keys())

    # sort again, now with diff info, note that python's sort is stable
    names.sort()
    if diff_results is not None:
        # biggest relative changes first
        names.sort(key=lambda n: tuple(
            types[k].ratio(
                getattr(table.get(n), k, None),
                getattr(diff_table.get(n), k, None))
            for k in fields),
            reverse=True)
    if sort:
        # explicit sort fields take the highest priority (applied last)
        for k, reverse in reversed(sort):
            names.sort(
                key=lambda n: tuple(
                    (getattr(table[n], k),)
                    if getattr(table.get(n), k, None) is not None else ()
                    for k in ([k] if k else [
                        k for k in Result._sort if k in fields])),
                reverse=reverse ^ (not k or k in Result._fields))

    # build up our lines
    lines = []

    # header
    header = []
    header.append('%s%s' % (
        ','.join(by),
        ' (%d added, %d removed)' % (
            sum(1 for n in table if n not in diff_table),
            sum(1 for n in diff_table if n not in table))
            if diff_results is not None and not percent else '')
        if not summary else '')
    if diff_results is None:
        for k in fields:
            header.append(k)
    elif percent:
        for k in fields:
            header.append(k)
    else:
        # diff mode shows old/new/diff columns per field
        for k in fields:
            header.append('o'+k)
        for k in fields:
            header.append('n'+k)
        for k in fields:
            header.append('d'+k)
    header.append('')
    lines.append(header)

    def table_entry(name, r, diff_r=None, ratios=[]):
        # render one row, r and diff_r may be None for added/removed rows
        entry = []
        entry.append(name)
        if diff_results is None:
            for k in fields:
                entry.append(getattr(r, k).table()
                    if getattr(r, k, None) is not None
                    else types[k].none)
        elif percent:
            for k in fields:
                entry.append(getattr(r, k).diff_table()
                    if getattr(r, k, None) is not None
                    else types[k].diff_none)
        else:
            for k in fields:
                entry.append(getattr(diff_r, k).diff_table()
                    if getattr(diff_r, k, None) is not None
                    else types[k].diff_none)
            for k in fields:
                entry.append(getattr(r, k).diff_table()
                    if getattr(r, k, None) is not None
                    else types[k].diff_none)
            for k in fields:
                entry.append(types[k].diff_diff(
                    getattr(r, k, None),
                    getattr(diff_r, k, None)))
        # trailing ratio annotation
        if diff_results is None:
            entry.append('')
        elif percent:
            entry.append(' (%s)' % ', '.join(
                '+∞%' if t == +m.inf
                else '-∞%' if t == -m.inf
                else '%+.1f%%' % (100*t)
                for t in ratios))
        else:
            entry.append(' (%s)' % ', '.join(
                    '+∞%' if t == +m.inf
                    else '-∞%' if t == -m.inf
                    else '%+.1f%%' % (100*t)
                    for t in ratios
                    if t)
                if any(ratios) else '')
        return entry

    # entries
    if not summary:
        for name in names:
            r = table.get(name)
            if diff_results is None:
                diff_r = None
                ratios = None
            else:
                diff_r = diff_table.get(name)
                ratios = [
                    types[k].ratio(
                        getattr(r, k, None),
                        getattr(diff_r, k, None))
                    for k in fields]
                # skip unchanged rows unless --all
                if not all_ and not any(ratios):
                    continue
            lines.append(table_entry(name, r, diff_r, ratios))

    # total
    r = next(iter(fold(Result, results, by=[])), None)
    if diff_results is None:
        diff_r = None
        ratios = None
    else:
        diff_r = next(iter(fold(Result, diff_results, by=[])), None)
        ratios = [
            types[k].ratio(
                getattr(r, k, None),
                getattr(diff_r, k, None))
            for k in fields]
    lines.append(table_entry('TOTAL', r, diff_r, ratios))

    # find the best widths, note that column 0 contains the names and column -1
    # the ratios, so those are handled a bit differently
    widths = [
        ((max(it.chain([w], (len(l[i]) for l in lines)))+1+4-1)//4)*4-1
        for w, i in zip(
            it.chain([23], it.repeat(7)),
            range(len(lines[0])-1))]

    # adjust the name width based on the expected call depth, though
    # note this doesn't really work with unbounded recursion
    if not summary and not m.isinf(depth):
        widths[0] += 4*(depth-1)

    # print the tree recursively
    print('%-*s %s%s' % (
        widths[0], lines[0][0],
        ' '.join('%*s' % (w, x)
            for w, x in zip(widths[1:], lines[0][1:-1])),
        lines[0][-1]))
    if not summary:
        def recurse(results_, depth_, prefixes=('', '', '', '')):
            # rebuild our tables at each layer
            table_ = {
                ','.join(str(getattr(r, k) or '') for k in by): r
                for r in results_}
            names_ = list(table_.keys())

            # sort again at each layer, keep in mind the numbers are
            # changing as we descend
            names_.sort()
            if sort:
                for k, reverse in reversed(sort):
                    names_.sort(
                        key=lambda n: tuple(
                            (getattr(table_[n], k),)
                            if getattr(table_.get(n), k, None) is not None
                            else ()
                            for k in ([k] if k else [
                                k for k in Result._sort if k in fields])),
                        reverse=reverse ^ (not k or k in Result._fields))

            for i, name in enumerate(names_):
                r = table_[name]
                is_last = (i == len(names_)-1)

                print('%s%-*s %s' % (
                    prefixes[0+is_last],
                    widths[0] - (
                        len(prefixes[0+is_last])
                        if not m.isinf(depth) else 0),
                    name,
                    ' '.join('%*s' % (w, x)
                        for w, x in zip(
                            widths[1:],
                            table_entry(name, r)[1:]))))

                # recurse?
                # prefixes are (not-last, last, not-last-cont, last-cont)
                if depth_ > 1:
                    recurse(
                        r.children,
                        depth_-1,
                        (prefixes[2+is_last] + "|-> ",
                            prefixes[2+is_last] + "'-> ",
                            prefixes[2+is_last] + "|   ",
                            prefixes[2+is_last] + "    "))

        # we have enough going on with diffing to make the top layer
        # a special case
        for name, line in zip(names, lines[1:-1]):
            print('%-*s %s%s' % (
                widths[0], line[0],
                ' '.join('%*s' % (w, x)
                    for w, x in zip(widths[1:], line[1:-1])),
                line[-1]))

            if name in table and depth > 1:
                recurse(
                    table[name].children,
                    depth-1,
                    ("|-> ",
                        "'-> ",
                        "|   ",
                        "    "))

    print('%-*s %s%s' % (
        widths[0], lines[-1][0],
        ' '.join('%*s' % (w, x)
            for w, x in zip(widths[1:], lines[-1][1:-1])),
        lines[-1][-1]))
def annotate(Result, results, *,
        annotate=None,
        threshold=None,
        branches=False,
        caches=False,
        **args):
    """Print source files annotated with their measurements.

    - annotate:  if truthy, print whole files instead of only hot spans
    - threshold: one or two fractions of the max measurement, the low
                 threshold selects what to show, the high threshold
                 what to highlight
    - branches:  rank by branch misses instead of cycles
    - caches:    rank by cache misses instead of cycles

    Note args must provide 'context', 'width', and 'color'.
    """
    # figure out the threshold
    if threshold is None:
        t0, t1 = THRESHOLD
    elif len(threshold) == 1:
        t0, t1 = threshold[0], threshold[0]
    else:
        t0, t1 = threshold
    t0, t1 = min(t0, t1), max(t0, t1)

    # which field are we ranking by?
    if not branches and not caches:
        tk = 'cycles'
    elif branches:
        tk = 'bmisses'
    else:
        tk = 'cmisses'

    # find max cycles
    max_ = max(it.chain((float(getattr(r, tk)) for r in results), [1]))

    # iterate over source files in first-seen order
    for path in co.OrderedDict.fromkeys(r.file for r in results).keys():
        # flatten to line info
        results = fold(Result, results, by=['file', 'line'])
        table = {r.line: r for r in results if r.file == path}

        # calculate spans to show
        if not annotate:
            spans = []
            last = None
            func = None
            for line, r in sorted(table.items()):
                if float(getattr(r, tk)) / max_ >= t0:
                    if last is not None and line - last.stop <= args['context']:
                        # close enough to the previous span, extend it
                        last = range(
                            last.start,
                            line+1+args['context'])
                    else:
                        # start a new span
                        if last is not None:
                            spans.append((last, func))
                        last = range(
                            line-args['context'],
                            line+1+args['context'])
                        func = r.function
            if last is not None:
                spans.append((last, func))

        with open(path) as f:
            skipped = False
            for i, line in enumerate(f):
                # skip lines not in spans?
                if not annotate and not any(i+1 in s for s, _ in spans):
                    skipped = True
                    continue

                if skipped:
                    skipped = False
                    # NOTE(review): this always shows the function of the
                    # *first* span, not the span being entered — looks
                    # like a quirk, preserved as-is
                    print('%s@@ %s:%d: %s @@%s' % (
                        '\x1b[36m' if args['color'] else '',
                        path,
                        i+1,
                        next(iter(f for _, f in spans)),
                        '\x1b[m' if args['color'] else ''))

                # build line
                if line.endswith('\n'):
                    line = line[:-1]

                r = table.get(i+1)
                if r is not None and (
                        float(r.cycles) > 0
                        if not branches and not caches
                        else float(r.bmisses) > 0 or float(r.branches) > 0
                        if branches
                        else float(r.cmisses) > 0 or float(r.caches) > 0):
                    # append the measurements as a comment
                    line = '%-*s // %s' % (
                        args['width'],
                        line,
                        '%s cycles' % r.cycles
                            if not branches and not caches
                            else '%s bmisses, %s branches' % (r.bmisses, r.branches)
                            if branches
                            else '%s cmisses, %s caches' % (r.cmisses, r.caches))

                    # colorize hot lines?
                    if args['color']:
                        if float(getattr(r, tk)) / max_ >= t1:
                            line = '\x1b[1;31m%s\x1b[m' % line
                        elif float(getattr(r, tk)) / max_ >= t0:
                            line = '\x1b[35m%s\x1b[m' % line

                print(line)
def report(perf_paths, *,
        by=None,
        fields=None,
        defines=None,
        sort=None,
        branches=False,
        caches=False,
        **args):
    """Collect/load perf results and report them.

    Depending on args this writes a CSV (-o), diffs against a previous
    CSV (-d), annotates source files (-A/-T), or prints a table.

    perf_paths -- input *.perf files, passed to collect unless -u is given.
    by         -- fields to group results by.
    fields     -- result fields to show.
    defines    -- only include results matching these field values.
    sort       -- list of (field, reverse) pairs, applied in order.
    branches   -- report branches/branch misses instead of cycles.
    caches     -- report cache accesses/misses instead of cycles.
    args       -- remaining CLI flags (color, depth, use, output, diff, ...).
    """
    # figure out what color should be
    if args.get('color') == 'auto':
        args['color'] = sys.stdout.isatty()
    elif args.get('color') == 'always':
        args['color'] = True
    else:
        args['color'] = False

    # depth of 0 == m.inf
    if args.get('depth') == 0:
        args['depth'] = m.inf

    # find sizes
    if not args.get('use', None):
        results = collect(perf_paths, **args)
    else:
        # -u: don't parse perf files, load a previously-written CSV instead
        results = []
        with openio(args['use']) as f:
            reader = csv.DictReader(f, restval='')
            for r in reader:
                # skip rows with no perf_-prefixed fields at all
                if not any('perf_'+k in r and r['perf_'+k].strip()
                        for k in PerfResult._fields):
                    continue
                try:
                    results.append(PerfResult(
                        **{k: r[k] for k in PerfResult._by
                            if k in r and r[k].strip()},
                        **{k: r['perf_'+k] for k in PerfResult._fields
                            if 'perf_'+k in r and r['perf_'+k].strip()}))
                except TypeError:
                    # ignore rows missing required fields
                    pass

    # fold
    results = fold(PerfResult, results, by=by, defines=defines)

    # sort, note that python's sort is stable
    results.sort()
    if sort:
        # apply sort keys in reverse so the first key wins; note the inner
        # k in the generator shadows the loop's k on purpose
        for k, reverse in reversed(sort):
            results.sort(
                key=lambda r: tuple(
                    (getattr(r, k),) if getattr(r, k) is not None else ()
                    for k in ([k] if k else PerfResult._sort)),
                reverse=reverse ^ (not k or k in PerfResult._fields))

    # write results to CSV
    if args.get('output'):
        with openio(args['output'], 'w') as f:
            writer = csv.DictWriter(f,
                (by if by is not None else PerfResult._by)
                + ['perf_'+k for k in (
                    fields if fields is not None else PerfResult._fields)])
            writer.writeheader()
            for r in results:
                writer.writerow(
                    {k: getattr(r, k) for k in (
                        by if by is not None else PerfResult._by)}
                    | {'perf_'+k: getattr(r, k) for k in (
                        fields if fields is not None else PerfResult._fields)})

    # find previous results?
    if args.get('diff'):
        diff_results = []
        try:
            with openio(args['diff']) as f:
                reader = csv.DictReader(f, restval='')
                for r in reader:
                    if not any('perf_'+k in r and r['perf_'+k].strip()
                            for k in PerfResult._fields):
                        continue
                    try:
                        diff_results.append(PerfResult(
                            **{k: r[k] for k in PerfResult._by
                                if k in r and r[k].strip()},
                            **{k: r['perf_'+k] for k in PerfResult._fields
                                if 'perf_'+k in r and r['perf_'+k].strip()}))
                    except TypeError:
                        pass
        except FileNotFoundError:
            # missing diff file just means everything is "new"
            pass

        # fold
        diff_results = fold(PerfResult, diff_results, by=by, defines=defines)

    # print table
    if not args.get('quiet'):
        if args.get('annotate') or args.get('threshold'):
            # annotate sources
            annotate(PerfResult, results,
                branches=branches,
                caches=caches,
                **args)
        else:
            # print table; default fields follow the requested metric
            table(PerfResult, results,
                diff_results if args.get('diff') else None,
                by=by if by is not None else ['function'],
                fields=fields if fields is not None
                    else ['cycles'] if not branches and not caches
                    else ['bmisses', 'branches'] if branches
                    else ['cmisses', 'caches'],
                sort=sort,
                **args)
  1006. def main(**args):
  1007. if args.get('record'):
  1008. return record(**args)
  1009. else:
  1010. return report(**args)
if __name__ == "__main__":
    import argparse
    import sys

    # bit of a hack, but parse_intermixed_args and REMAINDER are
    # incompatible, so we need to figure out what we want before running
    # argparse
    if '-R' in sys.argv or '--record' in sys.argv:
        nargs = argparse.REMAINDER
    else:
        nargs = '*'

    # monkeypatch a private 'ignore' conflict handler so flags shared
    # between the main parser and the record group (-o, --perf-path)
    # can be registered twice without argparse erroring out
    argparse.ArgumentParser._handle_conflict_ignore = lambda *_: None
    argparse._ArgumentGroup._handle_conflict_ignore = lambda *_: None
    parser = argparse.ArgumentParser(
        description="Aggregate and report Linux perf results.",
        allow_abbrev=False,
        conflict_handler='ignore')
    parser.add_argument(
        'perf_paths',
        nargs=nargs,
        help="Input *.perf files.")
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help="Output commands that run behind the scenes.")
    parser.add_argument(
        '-q', '--quiet',
        action='store_true',
        help="Don't show anything, useful with -o.")
    parser.add_argument(
        '-o', '--output',
        help="Specify CSV file to store results.")
    parser.add_argument(
        '-u', '--use',
        help="Don't parse anything, use this CSV file.")
    parser.add_argument(
        '-d', '--diff',
        help="Specify CSV file to diff against.")
    parser.add_argument(
        '-a', '--all',
        action='store_true',
        help="Show all, not just the ones that changed.")
    parser.add_argument(
        '-p', '--percent',
        action='store_true',
        help="Only show percentage change, not a full diff.")
    parser.add_argument(
        '-b', '--by',
        action='append',
        choices=PerfResult._by,
        help="Group by this field.")
    parser.add_argument(
        '-f', '--field',
        dest='fields',
        action='append',
        choices=PerfResult._fields,
        help="Show this field.")
    parser.add_argument(
        '-D', '--define',
        dest='defines',
        action='append',
        # parse "key=v1,v2,..." into (key, {v1, v2, ...})
        type=lambda x: (lambda k,v: (k, set(v.split(','))))(*x.split('=', 1)),
        help="Only include results where this field is this value.")
    class AppendSort(argparse.Action):
        # accumulates (field, reverse) pairs; -S reverses, -s does not
        def __call__(self, parser, namespace, value, option):
            if namespace.sort is None:
                namespace.sort = []
            namespace.sort.append((value, True if option == '-S' else False))
    parser.add_argument(
        '-s', '--sort',
        nargs='?',
        action=AppendSort,
        help="Sort by this field.")
    parser.add_argument(
        '-S', '--reverse-sort',
        nargs='?',
        action=AppendSort,
        help="Sort by this field, but backwards.")
    parser.add_argument(
        '-Y', '--summary',
        action='store_true',
        help="Only show the total.")
    parser.add_argument(
        '-F', '--source',
        dest='sources',
        action='append',
        help="Only consider definitions in this file. Defaults to anything "
            "in the current directory.")
    parser.add_argument(
        '--everything',
        action='store_true',
        help="Include builtin and libc specific symbols.")
    parser.add_argument(
        '--branches',
        action='store_true',
        help="Show branches and branch misses.")
    parser.add_argument(
        '--caches',
        action='store_true',
        help="Show cache accesses and cache misses.")
    parser.add_argument(
        '-P', '--propagate',
        type=lambda x: int(x, 0),
        help="Depth to propagate samples up the call-stack. 0 propagates up "
            "to the entry point, 1 does no propagation. Defaults to 0.")
    parser.add_argument(
        '-Z', '--depth',
        nargs='?',
        type=lambda x: int(x, 0),
        const=0,
        help="Depth of function calls to show. 0 shows all calls but may not "
            "terminate!")
    parser.add_argument(
        '-A', '--annotate',
        action='store_true',
        help="Show source files annotated with coverage info.")
    parser.add_argument(
        '-T', '--threshold',
        nargs='?',
        type=lambda x: tuple(float(x) for x in x.split(',')),
        const=THRESHOLD,
        help="Show lines with samples above this threshold as a percent of "
            "all lines. Defaults to %s." % ','.join(str(t) for t in THRESHOLD))
    parser.add_argument(
        '-c', '--context',
        type=lambda x: int(x, 0),
        default=3,
        help="Show n additional lines of context. Defaults to 3.")
    parser.add_argument(
        '-W', '--width',
        type=lambda x: int(x, 0),
        default=80,
        help="Assume source is styled with this many columns. Defaults to 80.")
    parser.add_argument(
        '--color',
        choices=['never', 'always', 'auto'],
        default='auto',
        help="When to use terminal colors. Defaults to 'auto'.")
    parser.add_argument(
        '-j', '--jobs',
        nargs='?',
        type=lambda x: int(x, 0),
        const=0,
        help="Number of processes to use. 0 spawns one process per core.")
    parser.add_argument(
        '--perf-path',
        type=lambda x: x.split(),
        help="Path to the perf executable, may include flags. "
            "Defaults to %r." % PERF_PATH)
    parser.add_argument(
        '--objdump-path',
        type=lambda x: x.split(),
        default=OBJDUMP_PATH,
        help="Path to the objdump executable, may include flags. "
            "Defaults to %r." % OBJDUMP_PATH)

    # record flags
    record_parser = parser.add_argument_group('record options')
    record_parser.add_argument(
        'command',
        nargs=nargs,
        help="Command to run.")
    record_parser.add_argument(
        '-R', '--record',
        action='store_true',
        help="Run a command and aggregate perf measurements.")
    # note -o/--perf-path are re-registered here only for the record
    # group's help text; the 'ignore' conflict handler above allows this
    record_parser.add_argument(
        '-o', '--output',
        help="Output file. Uses flock to synchronize. This is stored as a "
            "zip-file of multiple perf results.")
    record_parser.add_argument(
        '--perf-freq',
        help="perf sampling frequency. This is passed directly to perf. "
            "Defaults to %r." % PERF_FREQ)
    record_parser.add_argument(
        '--perf-period',
        help="perf sampling period. This is passed directly to perf.")
    record_parser.add_argument(
        '--perf-events',
        help="perf events to record. This is passed directly to perf. "
            "Defaults to %r." % PERF_EVENTS)
    record_parser.add_argument(
        '--perf-path',
        type=lambda x: x.split(),
        help="Path to the perf executable, may include flags. "
            "Defaults to %r." % PERF_PATH)

    # avoid intermixed/REMAINDER conflict, see above
    if nargs == argparse.REMAINDER:
        args = parser.parse_args()
    else:
        args = parser.parse_intermixed_args()

    # perf_paths/command overlap, so need to do some munging here
    args.command = args.perf_paths
    if args.record:
        # record mode requires both a command and an output file
        if not args.command:
            print('error: no command specified?')
            sys.exit(-1)
        if not args.output:
            print('error: no output file specified?')
            sys.exit(-1)

    # drop None values so callees' keyword defaults apply
    sys.exit(main(**{k: v
        for k, v in vars(args).items()
        if v is not None}))