summary.py 22 KB


  1. #!/usr/bin/env python3
  2. #
  3. # Script to summarize the outputs of other scripts. Operates on CSV files.
  4. #
  5. # Example:
  6. # ./scripts/code.py lfs.o lfs_util.o -q -o lfs.code.csv
  7. # ./scripts/data.py lfs.o lfs_util.o -q -o lfs.data.csv
  8. # ./scripts/summary.py lfs.code.csv lfs.data.csv -q -o lfs.csv
  9. # ./scripts/summary.py -Y lfs.csv -f code=code_size,data=data_size
  10. #
  11. # Copyright (c) 2022, The littlefs authors.
  12. # SPDX-License-Identifier: BSD-3-Clause
  13. #
import collections as co
import csv
import functools as ft
import glob
import itertools as it
import math as m
import os
import re
import sys
  22. CSV_PATHS = ['*.csv']
  23. # supported merge operations
  24. OPS = {
  25. 'add': lambda xs: sum(xs[1:], start=xs[0]),
  26. 'mul': lambda xs: m.prod(xs[1:], start=xs[0]),
  27. 'min': min,
  28. 'max': max,
  29. 'avg': lambda xs: sum(xs[1:], start=xs[0]) / len(xs),
  30. }
  31. def openio(path, mode='r'):
  32. if path == '-':
  33. if mode == 'r':
  34. return os.fdopen(os.dup(sys.stdin.fileno()), 'r')
  35. else:
  36. return os.fdopen(os.dup(sys.stdout.fileno()), 'w')
  37. else:
  38. return open(path, mode)
  39. # integer fields
  40. class IntField(co.namedtuple('IntField', 'x')):
  41. __slots__ = ()
  42. def __new__(cls, x):
  43. if isinstance(x, IntField):
  44. return x
  45. if isinstance(x, str):
  46. try:
  47. x = int(x, 0)
  48. except ValueError:
  49. # also accept +-∞ and +-inf
  50. if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
  51. x = float('inf')
  52. elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
  53. x = float('-inf')
  54. else:
  55. raise
  56. return super().__new__(cls, x)
  57. def __int__(self):
  58. assert not m.isinf(self.x)
  59. return self.x
  60. def __float__(self):
  61. return float(self.x)
  62. def __str__(self):
  63. if self.x == float('inf'):
  64. return '∞'
  65. elif self.x == float('-inf'):
  66. return '-∞'
  67. else:
  68. return str(self.x)
  69. none = '%7s' % '-'
  70. def table(self):
  71. return '%7s' % (self,)
  72. diff_none = '%7s' % '-'
  73. diff_table = table
  74. def diff_diff(self, other):
  75. new = self.x if self else 0
  76. old = other.x if other else 0
  77. diff = new - old
  78. if diff == float('+inf'):
  79. return '%7s' % '+∞'
  80. elif diff == float('-inf'):
  81. return '%7s' % '-∞'
  82. else:
  83. return '%+7d' % diff
  84. def ratio(self, other):
  85. new = self.x if self else 0
  86. old = other.x if other else 0
  87. if m.isinf(new) and m.isinf(old):
  88. return 0.0
  89. elif m.isinf(new):
  90. return float('+inf')
  91. elif m.isinf(old):
  92. return float('-inf')
  93. elif not old and not new:
  94. return 0.0
  95. elif not old:
  96. return 1.0
  97. else:
  98. return (new-old) / old
  99. def __add__(self, other):
  100. return IntField(self.x + other.x)
  101. def __mul__(self, other):
  102. return IntField(self.x * other.x)
  103. def __lt__(self, other):
  104. return self.x < other.x
  105. def __gt__(self, other):
  106. return self.__class__.__lt__(other, self)
  107. def __le__(self, other):
  108. return not self.__gt__(other)
  109. def __ge__(self, other):
  110. return not self.__lt__(other)
  111. def __truediv__(self, n):
  112. if m.isinf(self.x):
  113. return self
  114. else:
  115. return IntField(round(self.x / n))
  116. # float fields
  117. class FloatField(co.namedtuple('FloatField', 'x')):
  118. __slots__ = ()
  119. def __new__(cls, x):
  120. if isinstance(x, FloatField):
  121. return x
  122. if isinstance(x, str):
  123. try:
  124. x = float(x)
  125. except ValueError:
  126. # also accept +-∞ and +-inf
  127. if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
  128. x = float('inf')
  129. elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
  130. x = float('-inf')
  131. else:
  132. raise
  133. return super().__new__(cls, x)
  134. def __float__(self):
  135. return float(self.x)
  136. def __str__(self):
  137. if self.x == float('inf'):
  138. return '∞'
  139. elif self.x == float('-inf'):
  140. return '-∞'
  141. else:
  142. return '%.1f' % self.x
  143. none = IntField.none
  144. table = IntField.table
  145. diff_none = IntField.diff_none
  146. diff_table = IntField.diff_table
  147. diff_diff = IntField.diff_diff
  148. ratio = IntField.ratio
  149. __add__ = IntField.__add__
  150. __mul__ = IntField.__mul__
  151. __lt__ = IntField.__lt__
  152. __gt__ = IntField.__gt__
  153. __le__ = IntField.__le__
  154. __ge__ = IntField.__ge__
  155. def __truediv__(self, n):
  156. if m.isinf(self.x):
  157. return self
  158. else:
  159. return FloatField(self.x / n)
  160. # fractional fields, a/b
  161. class FracField(co.namedtuple('FracField', 'a,b')):
  162. __slots__ = ()
  163. def __new__(cls, a, b=None):
  164. if isinstance(a, FracField) and b is None:
  165. return a
  166. if isinstance(a, str) and b is None:
  167. a, b = a.split('/', 1)
  168. if b is None:
  169. b = a
  170. return super().__new__(cls, IntField(a), IntField(b))
  171. def __str__(self):
  172. return '%s/%s' % (self.a, self.b)
  173. none = '%11s %7s' % ('-', '-')
  174. def table(self):
  175. if not self.b.x:
  176. return self.none
  177. t = self.a.x/self.b.x
  178. return '%11s %7s' % (
  179. self,
  180. '∞%' if t == float('+inf')
  181. else '-∞%' if t == float('-inf')
  182. else '%.1f%%' % (100*t))
  183. diff_none = '%11s' % '-'
  184. def diff_table(self):
  185. if not self.b.x:
  186. return self.diff_none
  187. return '%11s' % (self,)
  188. def diff_diff(self, other):
  189. new_a, new_b = self if self else (IntField(0), IntField(0))
  190. old_a, old_b = other if other else (IntField(0), IntField(0))
  191. return '%11s' % ('%s/%s' % (
  192. new_a.diff_diff(old_a).strip(),
  193. new_b.diff_diff(old_b).strip()))
  194. def ratio(self, other):
  195. new_a, new_b = self if self else (IntField(0), IntField(0))
  196. old_a, old_b = other if other else (IntField(0), IntField(0))
  197. new = new_a.x/new_b.x if new_b.x else 1.0
  198. old = old_a.x/old_b.x if old_b.x else 1.0
  199. return new - old
  200. def __add__(self, other):
  201. return FracField(self.a + other.a, self.b + other.b)
  202. def __mul__(self, other):
  203. return FracField(self.a * other.a, self.b + other.b)
  204. def __lt__(self, other):
  205. self_r = self.a.x/self.b.x if self.b.x else float('-inf')
  206. other_r = other.a.x/other.b.x if other.b.x else float('-inf')
  207. return self_r < other_r
  208. def __gt__(self, other):
  209. return self.__class__.__lt__(other, self)
  210. def __le__(self, other):
  211. return not self.__gt__(other)
  212. def __ge__(self, other):
  213. return not self.__lt__(other)
  214. def __truediv__(self, n):
  215. return FracField(self.a / n, self.b / n)
  216. # available types
  217. TYPES = [IntField, FloatField, FracField]
  218. def homogenize(results, *,
  219. by=None,
  220. fields=None,
  221. renames=[],
  222. define={},
  223. types=None,
  224. **_):
  225. results = results.copy()
  226. # rename fields?
  227. if renames:
  228. for r in results:
  229. # make a copy so renames can overlap
  230. r_ = {}
  231. for new_k, old_k in renames:
  232. if old_k in r:
  233. r_[new_k] = r[old_k]
  234. r.update(r_)
  235. # filter by matching defines
  236. if define:
  237. results_ = []
  238. for r in results:
  239. if all(k in r and r[k] in vs for k, vs in define):
  240. results_.append(r)
  241. results = results_
  242. # if fields not specified, try to guess from data
  243. if fields is None:
  244. fields = co.OrderedDict()
  245. for r in results:
  246. for k, v in r.items():
  247. if by is not None and k in by:
  248. continue
  249. types_ = []
  250. for type in fields.get(k, TYPES):
  251. try:
  252. type(v)
  253. types_.append(type)
  254. except ValueError:
  255. pass
  256. fields[k] = types_
  257. fields = list(k for k,v in fields.items() if v)
  258. # infer 'by' fields?
  259. if by is None:
  260. by = co.OrderedDict()
  261. for r in results:
  262. # also ignore None keys, these are introduced by csv.DictReader
  263. # when header + row mismatch
  264. by.update((k, True) for k in r.keys()
  265. if k is not None
  266. and k not in fields
  267. and not any(k == old_k for _, old_k in renames))
  268. by = list(by.keys())
  269. # go ahead and clean up none values, these can have a few forms
  270. results_ = []
  271. for r in results:
  272. results_.append({
  273. k: r[k] for k in it.chain(by, fields)
  274. if r.get(k) is not None and not (
  275. isinstance(r[k], str)
  276. and re.match('^\s*[+-]?\s*$', r[k]))})
  277. results = results_
  278. # find best type for all fields
  279. if types is None:
  280. def is_type(x, type):
  281. try:
  282. type(x)
  283. return True
  284. except ValueError:
  285. return False
  286. types = {}
  287. for k in fields:
  288. for type in TYPES:
  289. if all(k not in r or is_type(r[k], type) for r in results_):
  290. types[k] = type
  291. break
  292. else:
  293. print("no type matches field %r?" % k)
  294. sys.exit(-1)
  295. # homogenize types
  296. for r in results:
  297. for k in fields:
  298. if k in r:
  299. r[k] = types[k](r[k])
  300. return by, fields, types, results
  301. def fold(results, *,
  302. by=[],
  303. fields=[],
  304. ops={},
  305. **_):
  306. folding = co.OrderedDict()
  307. for r in results:
  308. name = tuple(r.get(k, '') for k in by)
  309. if name not in folding:
  310. folding[name] = {k: [] for k in fields}
  311. for k in fields:
  312. if k in r:
  313. folding[name][k].append(r[k])
  314. # merge fields, we need the count at this point for averages
  315. folded = []
  316. for name, r in folding.items():
  317. r_ = {}
  318. for k, vs in r.items():
  319. if vs:
  320. # sum fields by default
  321. op = OPS[ops.get(k, 'add')]
  322. r_[k] = op(vs)
  323. # drop any rows without fields and any empty keys
  324. if r_:
  325. folded.append(dict(
  326. {k: v for k, v in zip(by, name) if v},
  327. **r_))
  328. return folded
  329. def table(results, diff_results=None, *,
  330. by=None,
  331. fields=None,
  332. types=None,
  333. ops=None,
  334. sort=None,
  335. reverse_sort=None,
  336. summary=False,
  337. all=False,
  338. percent=False,
  339. **_):
  340. all_, all = all, __builtins__.all
  341. table = {tuple(r.get(k,'') for k in by): r for r in results}
  342. diff_table = {tuple(r.get(k,'') for k in by): r for r in diff_results or []}
  343. # sort, note that python's sort is stable
  344. names = list(table.keys() | diff_table.keys())
  345. names.sort()
  346. if diff_results is not None:
  347. names.sort(key=lambda n: tuple(
  348. -types[k].ratio(
  349. table.get(n,{}).get(k),
  350. diff_table.get(n,{}).get(k))
  351. for k in fields))
  352. if sort:
  353. names.sort(key=lambda n: tuple(
  354. (table[n][k],) if k in table.get(n,{}) else ()
  355. for k in sort),
  356. reverse=True)
  357. elif reverse_sort:
  358. names.sort(key=lambda n: tuple(
  359. (table[n][k],) if k in table.get(n,{}) else ()
  360. for k in reverse_sort),
  361. reverse=False)
  362. # print header
  363. print('%-36s' % ('%s%s' % (
  364. ','.join(k for k in by),
  365. ' (%d added, %d removed)' % (
  366. sum(1 for n in table if n not in diff_table),
  367. sum(1 for n in diff_table if n not in table))
  368. if diff_results is not None and not percent else '')
  369. if not summary else ''),
  370. end='')
  371. if diff_results is None:
  372. print(' %s' % (
  373. ' '.join(k.rjust(len(types[k].none))
  374. for k in fields)))
  375. elif percent:
  376. print(' %s' % (
  377. ' '.join(k.rjust(len(types[k].diff_none))
  378. for k in fields)))
  379. else:
  380. print(' %s %s %s' % (
  381. ' '.join(('o'+k).rjust(len(types[k].diff_none))
  382. for k in fields),
  383. ' '.join(('n'+k).rjust(len(types[k].diff_none))
  384. for k in fields),
  385. ' '.join(('d'+k).rjust(len(types[k].diff_none))
  386. for k in fields)))
  387. # print entries
  388. if not summary:
  389. for name in names:
  390. r = table.get(name, {})
  391. if diff_results is not None:
  392. diff_r = diff_table.get(name, {})
  393. ratios = [types[k].ratio(r.get(k), diff_r.get(k))
  394. for k in fields]
  395. if not any(ratios) and not all_:
  396. continue
  397. print('%-36s' % ','.join(name), end='')
  398. if diff_results is None:
  399. print(' %s' % (
  400. ' '.join(r[k].table()
  401. if k in r else types[k].none
  402. for k in fields)))
  403. elif percent:
  404. print(' %s%s' % (
  405. ' '.join(r[k].diff_table()
  406. if k in r else types[k].diff_none
  407. for k in fields),
  408. ' (%s)' % ', '.join(
  409. '+∞%' if t == float('+inf')
  410. else '-∞%' if t == float('-inf')
  411. else '%+.1f%%' % (100*t)
  412. for t in ratios)))
  413. else:
  414. print(' %s %s %s%s' % (
  415. ' '.join(diff_r[k].diff_table()
  416. if k in diff_r else types[k].diff_none
  417. for k in fields),
  418. ' '.join(r[k].diff_table()
  419. if k in r else types[k].diff_none
  420. for k in fields),
  421. ' '.join(types[k].diff_diff(r.get(k), diff_r.get(k))
  422. if k in r or k in diff_r else types[k].diff_none
  423. for k in fields),
  424. ' (%s)' % ', '.join(
  425. '+∞%' if t == float('+inf')
  426. else '-∞%' if t == float('-inf')
  427. else '%+.1f%%' % (100*t)
  428. for t in ratios
  429. if t)
  430. if any(ratios) else ''))
  431. # print total
  432. total = fold(results, by=[], fields=fields, ops=ops)
  433. r = total[0] if total else {}
  434. if diff_results is not None:
  435. diff_total = fold(diff_results, by=[], fields=fields, ops=ops)
  436. diff_r = diff_total[0] if diff_total else {}
  437. ratios = [types[k].ratio(r.get(k), diff_r.get(k))
  438. for k in fields]
  439. print('%-36s' % 'TOTAL', end='')
  440. if diff_results is None:
  441. print(' %s' % (
  442. ' '.join(r[k].table()
  443. if k in r else types[k].none
  444. for k in fields)))
  445. elif percent:
  446. print(' %s%s' % (
  447. ' '.join(r[k].diff_table()
  448. if k in r else types[k].diff_none
  449. for k in fields),
  450. ' (%s)' % ', '.join(
  451. '+∞%' if t == float('+inf')
  452. else '-∞%' if t == float('-inf')
  453. else '%+.1f%%' % (100*t)
  454. for t in ratios)))
  455. else:
  456. print(' %s %s %s%s' % (
  457. ' '.join(diff_r[k].diff_table()
  458. if k in diff_r else types[k].diff_none
  459. for k in fields),
  460. ' '.join(r[k].diff_table()
  461. if k in r else types[k].diff_none
  462. for k in fields),
  463. ' '.join(types[k].diff_diff(r.get(k), diff_r.get(k))
  464. if k in r or k in diff_r else types[k].diff_none
  465. for k in fields),
  466. ' (%s)' % ', '.join(
  467. '+∞%' if t == float('+inf')
  468. else '-∞%' if t == float('-inf')
  469. else '%+.1f%%' % (100*t)
  470. for t in ratios
  471. if t)
  472. if any(ratios) else ''))
  473. def main(csv_paths, *,
  474. by=None,
  475. fields=None,
  476. define=[],
  477. **args):
  478. # separate out renames
  479. renames = [k.split('=', 1)
  480. for k in it.chain(by or [], fields or [])
  481. if '=' in k]
  482. if by is not None:
  483. by = [k.split('=', 1)[0] for k in by]
  484. if fields is not None:
  485. fields = [k.split('=', 1)[0] for k in fields]
  486. # figure out merge operations
  487. ops = {}
  488. for m in OPS.keys():
  489. for k in args.get(m, []):
  490. if k in ops:
  491. print("conflicting op for field %r?" % k)
  492. sys.exit(-1)
  493. ops[k] = m
  494. # rename ops?
  495. if renames:
  496. ops_ = {}
  497. for new_k, old_k in renames:
  498. if old_k in ops:
  499. ops_[new_k] = ops[old_k]
  500. ops.update(ops_)
  501. # find CSV files
  502. paths = []
  503. for path in csv_paths:
  504. if os.path.isdir(path):
  505. path = path + '/*.csv'
  506. for path in glob.glob(path):
  507. paths.append(path)
  508. if not paths:
  509. print('no .csv files found in %r?' % csv_paths)
  510. sys.exit(-1)
  511. results = []
  512. for path in paths:
  513. try:
  514. with openio(path) as f:
  515. reader = csv.DictReader(f, restval='')
  516. for r in reader:
  517. results.append(r)
  518. except FileNotFoundError:
  519. pass
  520. # homogenize
  521. by, fields, types, results = homogenize(results,
  522. by=by, fields=fields, renames=renames, define=define)
  523. # fold to remove duplicates
  524. results = fold(results,
  525. by=by, fields=fields, ops=ops)
  526. # write results to CSV
  527. if args.get('output'):
  528. with openio(args['output'], 'w') as f:
  529. writer = csv.DictWriter(f, by + fields)
  530. writer.writeheader()
  531. for r in results:
  532. writer.writerow(r)
  533. # find previous results?
  534. if args.get('diff'):
  535. diff_results = []
  536. try:
  537. with openio(args['diff']) as f:
  538. reader = csv.DictReader(f, restval='')
  539. for r in reader:
  540. diff_results.append(r)
  541. except FileNotFoundError:
  542. pass
  543. # homogenize
  544. _, _, _, diff_results = homogenize(diff_results,
  545. by=by, fields=fields, renames=renames, define=define, types=types)
  546. # fold to remove duplicates
  547. diff_results = fold(diff_results,
  548. by=by, fields=fields, ops=ops)
  549. # print table
  550. if not args.get('quiet'):
  551. table(
  552. results,
  553. diff_results if args.get('diff') else None,
  554. by=by,
  555. fields=fields,
  556. ops=ops,
  557. types=types,
  558. **args)
  559. if __name__ == "__main__":
  560. import argparse
  561. import sys
  562. parser = argparse.ArgumentParser(
  563. description="Summarize measurements in CSV files.")
  564. parser.add_argument(
  565. 'csv_paths',
  566. nargs='*',
  567. default=CSV_PATHS,
  568. help="Description of where to find *.csv files. May be a directory "
  569. "or list of paths. Defaults to %r." % CSV_PATHS)
  570. parser.add_argument(
  571. '-q', '--quiet',
  572. action='store_true',
  573. help="Don't show anything, useful with -o.")
  574. parser.add_argument(
  575. '-o', '--output',
  576. help="Specify CSV file to store results.")
  577. parser.add_argument(
  578. '-d', '--diff',
  579. help="Specify CSV file to diff against.")
  580. parser.add_argument(
  581. '-a', '--all',
  582. action='store_true',
  583. help="Show all, not just the ones that changed.")
  584. parser.add_argument(
  585. '-p', '--percent',
  586. action='store_true',
  587. help="Only show percentage change, not a full diff.")
  588. parser.add_argument(
  589. '-b', '--by',
  590. type=lambda x: [x.strip() for x in x.split(',')],
  591. help="Group by these fields. All other fields will be merged as "
  592. "needed. Can rename fields with new_name=old_name.")
  593. parser.add_argument(
  594. '-f', '--fields',
  595. type=lambda x: [x.strip() for x in x.split(',')],
  596. help="Use these fields. Can rename fields with new_name=old_name.")
  597. parser.add_argument(
  598. '-D', '--define',
  599. type=lambda x: (lambda k,v: (k, set(v.split(','))))(*x.split('=', 1)),
  600. action='append',
  601. help="Only include rows where this field is this value. May include "
  602. "comma-separated options.")
  603. parser.add_argument(
  604. '--add',
  605. type=lambda x: [x.strip() for x in x.split(',')],
  606. help="Add these fields (the default).")
  607. parser.add_argument(
  608. '--mul',
  609. type=lambda x: [x.strip() for x in x.split(',')],
  610. help="Multiply these fields.")
  611. parser.add_argument(
  612. '--min',
  613. type=lambda x: [x.strip() for x in x.split(',')],
  614. help="Take the minimum of these fields.")
  615. parser.add_argument(
  616. '--max',
  617. type=lambda x: [x.strip() for x in x.split(',')],
  618. help="Take the maximum of these fields.")
  619. parser.add_argument(
  620. '--avg',
  621. type=lambda x: [x.strip() for x in x.split(',')],
  622. help="Average these fields.")
  623. parser.add_argument(
  624. '-s', '--sort',
  625. type=lambda x: [x.strip() for x in x.split(',')],
  626. help="Sort by these fields.")
  627. parser.add_argument(
  628. '-S', '--reverse-sort',
  629. type=lambda x: [x.strip() for x in x.split(',')],
  630. help="Sort by these fields, but backwards.")
  631. parser.add_argument(
  632. '-Y', '--summary',
  633. action='store_true',
  634. help="Only show the totals.")
  635. sys.exit(main(**{k: v
  636. for k, v in vars(parser.parse_intermixed_args()).items()
  637. if v is not None}))