Skip to content

Instantly share code, notes, and snippets.

@juice500ml
Last active June 28, 2024 09:50
Show Gist options
  • Save juice500ml/b8584dd6b57a927c0e8a26bff65b6c78 to your computer and use it in GitHub Desktop.
Save juice500ml/b8584dd6b57a927c0e8a26bff65b6c78 to your computer and use it in GitHub Desktop.
Counting files
import importlib.util
import os
import pkgutil
from collections import defaultdict
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
from cycler import cycler
# Install a fixed eight-entry hex colour cycle as matplotlib's global default
# for axes created after this point. Two colours ("#648fff" and "#dc267f")
# appear twice in the list.
mpl.rcParams["axes.prop_cycle"] = cycler(color=["#648fff", "#785ef0", "#dc267f", "#fe6100", "#FFB000", "#648fff", "#dc267f", "#000000"])
def _size_count(fname):
    """Return ``(suffix, line_count)`` for the file at *fname*.

    Args:
        fname: Path (``str`` or ``os.PathLike``) of the file to measure.

    Returns:
        A tuple of the file-name suffix (for example ``".py"``; ``""`` when
        there is none) and the number of lines in the file. The count is 0
        when *fname* is not a regular file (missing path or a directory).
    """
    path = Path(fname)
    # is_file() rather than exists(): a directory "exists" but cannot be
    # opened for reading, which used to raise IsADirectoryError.
    if not path.is_file():
        return path.suffix, 0
    # Undecodable bytes are replaced rather than raised so that a stray
    # binary file in a scanned tree does not abort the whole count.
    with open(path, errors="replace") as handle:
        # Stream the file instead of materialising every line in memory.
        return path.suffix, sum(1 for _ in handle)
def _recursive_python_count(fname, cache):
    """Count lines and files reachable from *fname* via ESPnet ``from`` imports.

    The file is read, and every line that contains ``import`` together with
    one of ``espnet.``, ``espnet2.`` or ``espnetez.`` and that starts with
    ``from`` is treated as ``from <module> import ...``. The module's source
    file is located and counted recursively.

    Args:
        fname: Path of the Python source file to start from.
        cache: Mutable set of already-counted files. The canonical
            (``os.path.realpath``) path of every file counted here is added.

    Returns:
        ``(line_count, file_count)`` accumulated over *fname* and everything
        newly reached from it; ``(0, 0)`` when *fname* was already in *cache*.

    Raises:
        OSError: If *fname* cannot be opened.
    """
    # Canonicalise the key so that the same file reached through different
    # spellings (relative path, "./", symlink) is only counted once.
    key = os.path.realpath(fname)
    if key in cache:
        return 0, 0
    cache.add(key)

    with open(key, errors="replace") as handle:
        lines = handle.readlines()
    line_count, file_count = len(lines), 1

    prefixes = ("espnet.", "espnet2.", "espnetez.")
    for raw in lines:
        if "import" not in raw or not any(tag in raw for tag in prefixes):
            continue
        words = raw.split()
        if len(words) < 2 or words[0] != "from":
            continue
        module = words[1]
        # Relative imports cannot be resolved without package context.
        if module.startswith("."):
            continue
        # importlib.util.find_spec replaces pkgutil.get_loader, which is
        # deprecated and removed in Python 3.14. Like get_loader it imports
        # the parent package in order to locate the submodule.
        try:
            spec = importlib.util.find_spec(module)
        except (ImportError, ValueError):
            spec = None
        origin = getattr(spec, "origin", None)
        # Skip modules that cannot be located or have no Python source file
        # (namespace packages, built-ins, compiled extensions).
        if not origin or not origin.endswith(".py"):
            continue
        sub_lines, sub_files = _recursive_python_count(origin, cache)
        line_count += sub_lines
        file_count += sub_files
    return line_count, file_count
def _recursive_file_count(dirname, cache):
    """Yield ``(suffix, line_count)`` for each not-yet-seen file under *dirname*.

    Every entry found by ``rglob("*")`` is resolved. Resolved directories are
    descended into (this is what follows symlinks that point at directories),
    and each resolved file is measured with ``_size_count`` the first time it
    is encountered.

    Args:
        dirname: Directory to scan.
        cache: Mutable set of resolved path strings that were already
            handled. Both files and directories visited here are added.

    Yields:
        The ``(suffix, line_count)`` tuple returned by ``_size_count`` for
        each newly seen file.

    Raises:
        AssertionError: If *dirname* is not a directory (raised on first
            iteration, since this is a generator).
    """
    root = Path(dirname)
    assert root.is_dir(), f"{root} is not a directory"

    # Remember visited directories as well as files. Without this, every
    # nested directory was walked again after rglob had already covered it,
    # and a symlink pointing back at an ancestor recursed without end.
    root_key = str(root.resolve())
    if root_key in cache:
        return
    cache.add(root_key)

    for entry in root.rglob("*"):
        target = entry.resolve()
        if target.is_dir():
            yield from _recursive_file_count(target, cache)
            continue
        key = str(target)
        if key not in cache:
            cache.add(key)
            yield _size_count(key)
def count_files(files, recursive=False):
    """Tally line and file counts per file suffix for *files*.

    Args:
        files: Iterable of paths (``str`` or ``Path``). Directories are
            expanded to everything beneath them, including the targets of
            symlinks that point at directories.
        recursive: When true, each listed ``.py`` file is counted together
            with the ESPnet modules it imports (via
            ``_recursive_python_count``). Files discovered inside a listed
            directory are always counted this way regardless of this flag,
            as in the original implementation.

    Returns:
        ``(line_counts, file_counts)``: two ``defaultdict(int)`` objects keyed
        by suffix (for example ``".py"``).

    Each file and directory is handled at most once per call, keyed on its
    resolved path. Previously the directory branch fed ``rglob("*")`` (which
    already contains every nested subdirectory and its files) back into a
    fresh ``count_files`` call with a fresh cache, so files in nested
    directories were counted once per ancestor level.
    """
    line_counts = defaultdict(int)
    file_counts = defaultdict(int)
    seen = set()  # resolved path strings of files/directories already handled

    # Explicit work stack of (path, follow_imports). Reversed so that popping
    # processes the inputs in the order the caller listed them.
    pending = [(Path(f), recursive) for f in reversed(list(files))]
    while pending:
        path, follow_imports = pending.pop()
        resolved = path.resolve()
        key = str(resolved)

        if resolved.is_dir():
            # Visiting each directory once also protects against symlink cycles.
            if key not in seen:
                seen.add(key)
                children = sorted(resolved.iterdir(), reverse=True)
                pending.extend((child, True) for child in children)
            continue

        if follow_imports and path.suffix == ".py":
            # _recursive_python_count does its own cache check and insertion,
            # so the path must not be added to `seen` beforehand.
            sub_lines, sub_files = _recursive_python_count(key, seen)
            line_counts[".py"] += sub_lines
            file_counts[".py"] += sub_files
        elif key not in seen:
            seen.add(key)
            suffix, n_lines = _size_count(path)
            line_counts[suffix] += n_lines
            file_counts[suffix] += 1
    return line_counts, file_counts
def plot_bar(ez, esp, ylabel, title, fname):
    """Save a two-column stacked bar chart comparing ESPnet and ESPnet-EZ.

    Args:
        ez: Mapping of file suffix to count for the "ESPnet-EZ" column.
        esp: Mapping of file suffix to count for the "ESPnet" column.
        ylabel: Label for the y axis.
        title: Title of the axes.
        fname: Output path handed to ``Figure.savefig``.

    ``.conf`` counts are folded into ``.yaml`` and ``.md`` is dropped before
    plotting. The inputs are not modified: the original added ``.conf`` into
    ``.yaml`` in place, so plotting the same mapping twice would double-count,
    and it only worked with ``defaultdict`` inputs. Suffixes without a
    display name are labelled with the suffix itself instead of raising
    ``KeyError``.
    """
    mpl.rcParams["axes.prop_cycle"] = cycler(color=["#648fff", "#785ef0", "#dc267f", "#fe6100", "#FFB000", "#648fff", "#dc267f", "#000000"])
    ext = {".py": "Python", ".sh": "Bash", ".yaml": "Config", ".pl": "Perl"}

    def _fold(counts):
        # Work on a copy so the caller's mapping stays untouched.
        folded = dict(counts)
        folded[".yaml"] = folded.get(".yaml", 0) + folded.pop(".conf", 0)
        folded.pop(".md", None)
        return folded

    ez_counts, esp_counts = _fold(ez), _fold(esp)
    totals = {
        k: ez_counts.get(k, 0) + esp_counts.get(k, 0)
        for k in set(ez_counts) | set(esp_counts)
    }
    # Largest combined category at the bottom of the stack; ties are broken
    # by suffix so the output does not depend on set iteration order.
    order = sorted(totals, key=lambda k: (-totals[k], k))

    bottom = [0, 0]
    fig, ax = plt.subplots(figsize=(2., 3.5))
    top_bar = None
    for k in order:
        heights = [esp_counts.get(k, 0), ez_counts.get(k, 0)]
        # An empty legend label would be dropped silently, hence "(none)".
        label = ext.get(k, k or "(none)")
        top_bar = ax.bar(["ESPnet", "ESPnet-EZ"], heights, width=0.4, label=label, bottom=list(bottom))
        bottom[0] += heights[0]
        bottom[1] += heights[1]

    # Annotate the topmost segment of each column with that column's total.
    if top_bar is not None:
        if sum(bottom) > 10000:
            labels = [f"{b//1000}k" for b in bottom]
        else:
            labels = bottom
        ax.bar_label(top_bar, labels=labels)

    # Reverse so the legend reads top-to-bottom like the stack.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1], title="Language", loc="upper right", frameon=False)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    upper = int(max(bottom) * 1.1)
    # Skip a degenerate (0, 0) range when there is nothing to plot.
    if upper > 0:
        ax.set_ylim(0, upper)
    if sum(bottom) > 10000:
        ax.set_yticks([0, 50000, 100000, 150000, 200000, 250000], labels="0,50k,100k,150k,200k,250k".split(","))
    # The figure is left open after saving, as before.
    fig.savefig(fname, bbox_inches="tight")
# Minimum number of files: count only the listed files themselves
# (no import following) and plot them as "Newly written code".

# espnetez inputs
ez_new_inputs = [
    "egs2/librispeech_100/ez1/conf/owsm_finetune_base.yaml",
    "egs2/librispeech_100/ez1/main.py",
    # not counted: "egs2/librispeech_100/ez1/train.sh"
]

# espnet inputs
espnet_new_inputs = [
    "egs2/librispeech_100/s2t1/conf/tuning/train_s2t_ebf_lr1e-3_warmup5k.yaml",
    "egs2/librispeech_100/s2t1/conf/decode_s2t.yaml",
    "egs2/librispeech_100/s2t1/local/data_prep.py",
    "egs2/librispeech_100/s2t1/local/data.sh",
    # not counted: "egs2/librispeech_100/s2t1/run.sh"
]

ez_lines, _ = count_files(ez_new_inputs, recursive=False)
orig_lines, _ = count_files(espnet_new_inputs, recursive=False)
plot_bar(
    ez_lines,
    orig_lines,
    "# of lines",
    "Newly written code",
    "new_lines.pdf",
)
# Relevant files: count the listed inputs with import following enabled and
# plot both the line totals and the file totals as "Dependent code".
ez_dependent_inputs = [
    "egs2/librispeech_100/ez1/conf/owsm_finetune_base.yaml",
    "egs2/librispeech_100/ez1/main.py",
    # not counted: "egs2/librispeech_100/ez1/train.sh"
]

# NOTE(review): the first four entries also sit under the
# "egs2/librispeech_100/s2t1" directory listed right after them; whether they
# end up counted twice depends on how count_files de-duplicates. Confirm.
espnet_dependent_inputs = [
    "egs2/librispeech_100/s2t1/conf/tuning/train_s2t_ebf_lr1e-3_warmup5k.yaml",
    "egs2/librispeech_100/s2t1/conf/decode_s2t.yaml",
    "egs2/librispeech_100/s2t1/local/data_prep.py",
    "egs2/librispeech_100/s2t1/local/data.sh",
    # not counted: "egs2/librispeech_100/s2t1/run.sh"
    "egs2/librispeech_100/s2t1",
    "espnet2/bin/s2st_train.py",
    "espnet2/bin/aggregate_stats_dirs.py",
    "espnet2/bin/split_scps.py",
    "espnet2/bin/launch.py",
]

ez_lines, ez_files = count_files(ez_dependent_inputs, recursive=True)
orig_lines, orig_files = count_files(espnet_dependent_inputs, recursive=True)

plot_bar(ez_lines, orig_lines, "# of lines", "Dependent code", "all_lines.pdf")
plot_bar(ez_files, orig_files, "# of files", "Dependent code", "all_files.pdf")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment