Cache some (possibly) expensive function calls

This patch memoizes some of the functions to help speed up execution.
The speedup is quite variable, but ~30% is typical when generating a
medium-sized repository, and the output is byte-for-byte identical.
This commit is contained in:
Alberto Bertogli 2022-08-31 21:48:45 +01:00
parent 15547b2796
commit 518188288e
3 changed files with 67 additions and 41 deletions

55
git.py

@ -6,6 +6,7 @@ command line tool directly, so please be careful with using untrusted
parameters. parameters.
""" """
import functools
import sys import sys
import io import io
import subprocess import subprocess
@ -199,7 +200,8 @@ class Repo:
"""Returns a GitCommand() on our path.""" """Returns a GitCommand() on our path."""
return GitCommand(self.path, cmd) return GitCommand(self.path, cmd)
def for_each_ref(self, pattern=None, sort=None, count=None): @functools.lru_cache
def _for_each_ref(self, pattern=None, sort=None, count=None):
"""Returns a list of references.""" """Returns a list of references."""
cmd = self.cmd("for-each-ref") cmd = self.cmd("for-each-ref")
if sort: if sort:
@ -209,26 +211,25 @@ class Repo:
if pattern: if pattern:
cmd.arg(pattern) cmd.arg(pattern)
refs = []
for l in cmd.run(): for l in cmd.run():
obj_id, obj_type, ref = l.split() obj_id, obj_type, ref = l.split()
yield obj_id, obj_type, ref refs.append((obj_id, obj_type, ref))
return refs
def branches(self, sort="-authordate"):
"""Get the (name, obj_id) of the branches."""
refs = self.for_each_ref(pattern="refs/heads/", sort=sort)
for obj_id, _, ref in refs:
yield ref[len("refs/heads/") :], obj_id
@functools.cache
def branch_names(self): def branch_names(self):
"""Get the names of the branches.""" """Get the names of the branches."""
return (name for name, _ in self.branches()) refs = self._for_each_ref(pattern="refs/heads/", sort="-authordate")
return [ref[len("refs/heads/") :] for _, _, ref in refs]
@functools.cache
def tags(self, sort="-taggerdate"): def tags(self, sort="-taggerdate"):
"""Get the (name, obj_id) of the tags.""" """Get the (name, obj_id) of the tags."""
refs = self.for_each_ref(pattern="refs/tags/", sort=sort) refs = self._for_each_ref(pattern="refs/tags/", sort=sort)
for obj_id, _, ref in refs: return [(ref[len("refs/tags/") :], obj_id) for obj_id, _, ref in refs]
yield ref[len("refs/tags/") :], obj_id
@functools.lru_cache
def commit_ids(self, ref, limit=None): def commit_ids(self, ref, limit=None):
"""Generate commit ids.""" """Generate commit ids."""
cmd = self.cmd("rev-list") cmd = self.cmd("rev-list")
@ -238,9 +239,9 @@ class Repo:
cmd.arg(ref) cmd.arg(ref)
cmd.arg("--") cmd.arg("--")
for l in cmd.run(): return [l.rstrip("\n") for l in cmd.run()]
yield l.rstrip("\n")
@functools.lru_cache
def commit(self, commit_id): def commit(self, commit_id):
"""Return a single commit.""" """Return a single commit."""
cs = list(self.commits(commit_id, limit=1)) cs = list(self.commits(commit_id, limit=1))
@ -248,10 +249,10 @@ class Repo:
return None return None
return cs[0] return cs[0]
def commits(self, ref, limit=None, offset=0): @functools.lru_cache
def commits(self, ref, limit, offset=0):
"""Generate commit objects for the ref.""" """Generate commit objects for the ref."""
cmd = self.cmd("rev-list") cmd = self.cmd("rev-list")
if limit:
cmd.max_count = limit + offset cmd.max_count = limit + offset
cmd.header = None cmd.header = None
@ -261,6 +262,7 @@ class Repo:
info_buffer = "" info_buffer = ""
count = 0 count = 0
commits = []
for l in cmd.run(): for l in cmd.run():
if "\0" in l: if "\0" in l:
pre, post = l.split("\0", 1) pre, post = l.split("\0", 1)
@ -268,7 +270,7 @@ class Repo:
count += 1 count += 1
if count > offset: if count > offset:
yield Commit.from_str(self, info_buffer) commits.append(Commit.from_str(self, info_buffer))
# Start over. # Start over.
info_buffer = post info_buffer = post
@ -278,8 +280,11 @@ class Repo:
if info_buffer: if info_buffer:
count += 1 count += 1
if count > offset: if count > offset:
yield Commit.from_str(self, info_buffer) commits.append(Commit.from_str(self, info_buffer))
return commits
@functools.lru_cache
def diff(self, ref): def diff(self, ref):
"""Return a Diff object for the ref.""" """Return a Diff object for the ref."""
cmd = self.cmd("diff-tree") cmd = self.cmd("diff-tree")
@ -295,6 +300,7 @@ class Repo:
return Diff.from_str(cmd.run()) return Diff.from_str(cmd.run())
@functools.lru_cache
def refs(self): def refs(self):
"""Return a dict of obj_id -> ref.""" """Return a dict of obj_id -> ref."""
cmd = self.cmd("show-ref") cmd = self.cmd("show-ref")
@ -308,10 +314,12 @@ class Repo:
return r return r
@functools.lru_cache
def tree(self, ref): def tree(self, ref):
"""Returns a Tree instance for the given ref.""" """Returns a Tree instance for the given ref."""
return Tree(self, ref) return Tree(self, ref)
@functools.lru_cache
def blob(self, path, ref): def blob(self, path, ref):
"""Returns a Blob instance for the given path.""" """Returns a Blob instance for the given path."""
cmd = self.cmd("cat-file") cmd = self.cmd("cat-file")
@ -329,9 +337,10 @@ class Repo:
return Blob(out.read()[: int(head)]) return Blob(out.read()[: int(head)])
@functools.cache
def last_commit_timestamp(self): def last_commit_timestamp(self):
"""Return the timestamp of the last commit.""" """Return the timestamp of the last commit."""
refs = self.for_each_ref( refs = self._for_each_ref(
pattern="refs/heads/", sort="-committerdate", count=1 pattern="refs/heads/", sort="-committerdate", count=1
) )
for obj_id, _, _ in refs: for obj_id, _, _ in refs:
@ -515,12 +524,13 @@ class Diff:
class Tree: class Tree:
""" A git tree.""" """A git tree."""
def __init__(self, repo: Repo, ref: str): def __init__(self, repo: Repo, ref: str):
self.repo = repo self.repo = repo
self.ref = ref self.ref = ref
@functools.lru_cache
def ls( def ls(
self, path, recursive=False self, path, recursive=False
) -> Iterable[Tuple[str, smstr, Optional[int]]]: ) -> Iterable[Tuple[str, smstr, Optional[int]]]:
@ -537,6 +547,7 @@ class Tree:
else: else:
cmd.arg(path) cmd.arg(path)
files = []
for l in cmd.run(): for l in cmd.run():
_mode, otype, _oid, size, name = l.split(None, 4) _mode, otype, _oid, size, name = l.split(None, 4)
if size == "-": if size == "-":
@ -553,7 +564,9 @@ class Tree:
# We use a smart string for the name, as it's often tricky to # We use a smart string for the name, as it's often tricky to
# manipulate otherwise. # manipulate otherwise.
yield otype, smstr(name), size files.append((otype, smstr(name), size))
return files
class Blob: class Blob:

@ -9,6 +9,14 @@ try:
from pygments import highlight # type: ignore from pygments import highlight # type: ignore
from pygments import lexers # type: ignore from pygments import lexers # type: ignore
from pygments.formatters import HtmlFormatter # type: ignore from pygments.formatters import HtmlFormatter # type: ignore
_html_formatter = HtmlFormatter(
encoding="utf-8",
cssclass="source_code",
linenos="table",
anchorlinenos=True,
lineanchors="line",
)
except ImportError: except ImportError:
pygments = None pygments = None
@ -19,6 +27,7 @@ except ImportError:
markdown = None markdown = None
import base64 import base64
import functools
import mimetypes import mimetypes
import string import string
import os.path import os.path
@ -32,6 +41,7 @@ def shorten(s: str, width=60):
return s[:57] + "..." return s[:57] + "..."
@functools.lru_cache
def can_colorize(s: str): def can_colorize(s: str):
"""True if we can colorize the string, False otherwise.""" """True if we can colorize the string, False otherwise."""
if pygments is None: if pygments is None:
@ -77,6 +87,7 @@ def can_embed_image(repo, fname):
) )
@functools.lru_cache
def colorize_diff(s: str) -> str: def colorize_diff(s: str) -> str:
lexer = lexers.DiffLexer(encoding="utf-8") lexer = lexers.DiffLexer(encoding="utf-8")
formatter = HtmlFormatter(encoding="utf-8", cssclass="source_code") formatter = HtmlFormatter(encoding="utf-8", cssclass="source_code")
@ -84,6 +95,7 @@ def colorize_diff(s: str) -> str:
return highlight(s, lexer, formatter) return highlight(s, lexer, formatter)
@functools.lru_cache
def colorize_blob(fname, s: str) -> str: def colorize_blob(fname, s: str) -> str:
try: try:
lexer = lexers.guess_lexer_for_filename(fname, s, encoding="utf-8") lexer = lexers.guess_lexer_for_filename(fname, s, encoding="utf-8")
@ -98,24 +110,7 @@ def colorize_blob(fname, s: str) -> str:
except lexers.ClassNotFound: except lexers.ClassNotFound:
pass pass
formatter = HtmlFormatter( return highlight(s, lexer, _html_formatter)
encoding="utf-8",
cssclass="source_code",
linenos="table",
anchorlinenos=True,
lineanchors="line",
)
return highlight(s, lexer, formatter)
def markdown_blob(s: str) -> str:
extensions = [
"markdown.extensions.fenced_code",
"markdown.extensions.tables",
RewriteLocalLinksExtension(),
]
return markdown.markdown(s, extensions=extensions)
def embed_image_blob(fname: str, image_data: bytes) -> str: def embed_image_blob(fname: str, image_data: bytes) -> str:
@ -126,11 +121,13 @@ def embed_image_blob(fname: str, image_data: bytes) -> str:
) )
@functools.lru_cache
def is_binary(b: bytes): def is_binary(b: bytes):
# Git considers a blob binary if NUL in first ~8KB, so do the same. # Git considers a blob binary if NUL in first ~8KB, so do the same.
return b"\0" in b[:8192] return b"\0" in b[:8192]
@functools.lru_cache
def hexdump(s: bytes): def hexdump(s: bytes):
graph = string.ascii_letters + string.digits + string.punctuation + " " graph = string.ascii_letters + string.digits + string.punctuation + " "
b = s.decode("latin1") b = s.decode("latin1")
@ -181,3 +178,19 @@ if markdown:
md.treeprocessors.register( md.treeprocessors.register(
RewriteLocalLinks(), "RewriteLocalLinks", 1000 RewriteLocalLinks(), "RewriteLocalLinks", 1000
) )
_md_extensions = [
"markdown.extensions.fenced_code",
"markdown.extensions.tables",
RewriteLocalLinksExtension(),
]
@functools.lru_cache
def markdown_blob(s: str) -> str:
return markdown.markdown(s, extensions=_md_extensions)
else:
@functools.lru_cache
def markdown_blob(s: str) -> str:
raise RuntimeError("markdown_blob() called without markdown support")