# Counting files
import importlib.util
import os
import pkgutil
from collections import defaultdict
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
from cycler import cycler
# Install the colour cycle used by every figure drawn in this file.
mpl.rcParams["axes.prop_cycle"] = cycler(
    color=[
        "#648fff",
        "#785ef0",
        "#dc267f",
        "#fe6100",
        "#FFB000",
        "#648fff",
        "#dc267f",
        "#000000",
    ]
)
def _size_count(fname):
    """Return the suffix and the number of lines of a single file.

    Args:
        fname: Path of the file to inspect (string or path-like).

    Returns:
        A ``(suffix, line_count)`` tuple. ``suffix`` is ``Path(fname).suffix``
        (empty when the name has no extension). ``line_count`` is 0 when
        ``fname`` does not name a regular file, e.g. a missing path or a
        directory.
    """
    p = Path(fname)
    # A directory "exists" but cannot be opened for reading, so require a
    # regular file instead of mere existence.
    if not p.is_file():
        return p.suffix, 0
    # Undecodable bytes are replaced instead of raising so that a binary file
    # inside a scanned tree does not abort the whole count.
    with open(p, errors="replace") as f:
        return p.suffix, sum(1 for _ in f)
def _recursive_python_count(fname, cache):
    """Count lines and files of a Python file plus the ESPnet modules it imports.

    The file is scanned for lines containing ``import`` and one of
    ``espnet.``, ``espnet2.`` or ``espnetez.``. For each such line of the form
    ``from <module> import ...`` the module is located and counted
    recursively.

    Args:
        fname: Path of the Python source file to start from.
        cache: Set of real paths that were already counted. It is updated in
            place so that every file contributes at most once, which also
            terminates circular imports.

    Returns:
        A ``(line_count, file_count)`` tuple covering ``fname`` and all newly
        visited dependencies; ``(0, 0)`` when ``fname`` is already in
        ``cache``.
    """
    key = os.path.realpath(fname)
    if key in cache:
        return 0, 0
    # Register the file before descending: without this, two modules that
    # import each other would recurse forever.
    cache.add(key)

    with open(key, errors="replace") as f:
        lines = f.readlines()
    line_count, file_count = len(lines), 1

    prefixes = ("espnet.", "espnet2.", "espnetez.")
    for line in lines:
        if "import" not in line or not any(p in line for p in prefixes):
            continue
        tokens = line.split()
        # Only "from <module> import ..." statements are followed.
        if len(tokens) < 2 or tokens[0] != "from":
            continue
        try:
            spec = importlib.util.find_spec(tokens[1])
        except (ImportError, ValueError):
            # Module (or its parent package) cannot be resolved in this
            # environment; it contributes nothing to the count.
            continue
        origin = getattr(spec, "origin", None)
        # Skip unresolved modules and anything without Python source
        # (namespace packages, built-in or compiled modules).
        if not origin or not origin.endswith(".py"):
            continue
        lc, fc = _recursive_python_count(origin, cache)
        line_count += lc
        file_count += fc
    return line_count, file_count
def _recursive_file_count(dirname, cache):
    """Yield ``(suffix, line_count)`` for every file below ``dirname``.

    Args:
        dirname: Directory to walk.
        cache: Set of resolved path strings that were already reported.
            Files found in it are skipped, and every yielded file is added to
            it so that it is reported only once.

    Yields:
        The ``(suffix, line_count)`` tuple returned by ``_size_count`` for
        each regular file under ``dirname``.

    Raises:
        NotADirectoryError: If ``dirname`` is not a directory (raised when
            iteration starts, since this is a generator).
    """
    dirname = Path(dirname)
    if not dirname.is_dir():
        raise NotADirectoryError(f"{dirname} is not a directory")
    for entry in dirname.rglob("*"):
        entry = entry.resolve()
        # rglob already descends into sub-directories; recursing into them
        # again would report nested files once per ancestor directory.
        if not entry.is_file():
            continue
        key = str(entry)
        if key in cache:
            continue
        cache.add(key)
        yield _size_count(key)
def count_files(files, recursive=False):
    """Count lines and files per file suffix.

    Args:
        files: Iterable of paths. A directory entry is expanded to all
            regular files below it; any other entry is counted as one file
            (with 0 lines if it cannot be read as a regular file).
        recursive: When true, each ``.py`` file is counted together with the
            ESPnet modules it imports (via ``_recursive_python_count``). The
            flag applies equally to files found inside directories.

    Returns:
        A ``(line_counts, file_counts)`` tuple of ``defaultdict(int)`` objects
        keyed by suffix (e.g. ``".py"``).
    """
    line_counts = defaultdict(int)
    file_counts = defaultdict(int)
    # Shared across all entries so import-following sees one visited set.
    cache = set()

    for entry in files:
        resolved = Path(entry).resolve()
        if resolved.is_dir():
            # rglob is already recursive: take only its regular files so that
            # sub-directories are neither opened nor expanded a second time.
            targets = sorted(c for c in resolved.rglob("*") if c.is_file())
        else:
            targets = [Path(entry)]

        for target in targets:
            suffix = target.suffix
            if recursive and suffix == ".py" and target.is_file():
                # The recursive count already includes the file itself, so it
                # must not be added again below.
                n_lines, n_files = _recursive_python_count(str(target), cache)
            else:
                _, n_lines = _size_count(str(target))
                n_files = 1
            line_counts[suffix] += n_lines
            file_counts[suffix] += n_files
    return line_counts, file_counts
def plot_bar(ez, esp, ylabel, title, fname):
    """Draw a stacked bar chart comparing ESPnet and ESPnet-EZ counts.

    One bar is drawn for "ESPnet" and one for "ESPnet-EZ"; each suffix is a
    segment of the stack, largest combined total at the bottom. ``.conf``
    counts are merged into ``.yaml`` and ``.md`` is ignored. The inputs are
    not modified.

    Args:
        ez: Mapping from file suffix to count for ESPnet-EZ.
        esp: Mapping from file suffix to count for ESPnet.
        ylabel: Accepted for interface compatibility; not drawn on the figure.
        title: Accepted for interface compatibility; not drawn on the figure.
        fname: Output path passed to ``Figure.savefig``.
    """
    mpl.rcParams["axes.prop_cycle"] = cycler(color=["#648fff", "#785ef0", "#dc267f", "#fe6100", "#FFB000", "#648fff", "#dc267f", "#000000"])
    ext = {".py": "Python", ".sh": "Bash", ".yaml": "Config", ".pl": "Perl"}

    def _normalised(counts):
        # Work on a copy so that calling plot_bar twice with the same mapping
        # does not merge ".conf" into ".yaml" a second time.
        merged = dict(counts)
        merged[".yaml"] = merged.get(".yaml", 0) + merged.pop(".conf", 0)
        merged.pop(".md", None)
        return merged

    ez_counts = _normalised(ez)
    esp_counts = _normalised(esp)
    totals = {
        k: ez_counts.get(k, 0) + esp_counts.get(k, 0)
        for k in set(ez_counts) | set(esp_counts)
    }
    order = [k for k, _ in sorted(totals.items(), key=lambda item: -item[1])]

    bottom = [0, 0]
    fig, ax = plt.subplots(figsize=(2., 3.5))
    bar = None
    for k in order:
        esp_val, ez_val = esp_counts.get(k, 0), ez_counts.get(k, 0)
        bar = ax.bar(
            ["ESPnet", "ESPnet-EZ"],
            [esp_val, ez_val],
            width=0.4,
            # Suffixes without a known language name fall back to the raw
            # suffix instead of raising KeyError.
            label=ext.get(k, k or "(none)"),
            bottom=list(bottom),
        )
        bottom[0] += esp_val
        bottom[1] += ez_val

    if bar is not None:
        # Annotate the top-most segment with the total height of each stack,
        # abbreviated to thousands for large counts.
        if sum(bottom) > 10000:
            labels = [f"{b//1000}k" for b in bottom]
        else:
            labels = list(bottom)
        ax.bar_label(bar, labels=labels)

    handles, labels = ax.get_legend_handles_labels()
    # Reverse so the legend order matches the visual stacking (top first).
    ax.legend(handles[::-1], labels[::-1], title="Language", loc="upper right", frameon=False)
    ax.set_ylim(0, int(max(bottom) * 1.1))
    if sum(bottom) > 10000:
        ax.set_yticks([0, 50000, 100000, 150000, 200000, 250000], labels="0,50k,100k,150k,200k,250k".split(","))
    fig.savefig(fname, bbox_inches="tight")
# Minimum number of files (recursive=False)
# ESPnet-EZ recipe paths
ez_lines, _ = count_files(
    [
        # "egs2/librispeech_100/ez1/",
    ],
    recursive=False,
)
# ESPnet recipe paths
orig_lines, _ = count_files(
    [
        # "egs2/librispeech_100/s2t1/",
    ],
    recursive=False,
)
plot_bar(
    ez_lines,
    orig_lines,
    ylabel="# of lines",
    title="Newly written code",
    fname="new_lines.pdf",
)

# Relevant files (recursive=True)
ez_lines, ez_files = count_files(
    [
        # "egs2/librispeech_100/ez1/",
    ],
    recursive=True,
)
orig_lines, orig_files = count_files(
    [
        # "egs2/librispeech_100/s2t1/",
    ],
    recursive=True,
)
plot_bar(
    ez_lines,
    orig_lines,
    ylabel="# of lines",
    title="Dependent code",
    fname="all_lines.pdf",
)
plot_bar(
    ez_files,
    orig_files,
    ylabel="# of files",
    title="Dependent code",
    fname="all_files.pdf",
)
