Move to Python 3

Python 3 was released more than 10 years ago, and support for Python 2
is going away, with many Linux distributions starting to phase it out.

This patch migrates git-arr to Python 3.

The generated output is almost exactly the same; there are some minor
differences, such as HTML characters being quoted more aggressively, and
the handling of paths with non-utf8 values.
This commit is contained in:
Alberto Bertogli 2020-05-24 02:36:43 +01:00
parent cbb36e087c
commit 1183d6f817
5 changed files with 56 additions and 66 deletions

3
.gitignore vendored

@ -1,3 +1,4 @@
*.pyc *.pyc
__pycache__ __pycache__
.*.swp .*
!.gitignore

24
git-arr

@ -1,21 +1,15 @@
#!/usr/bin/env python #!/usr/bin/env python3
""" """
git-arr: A git web html generator. git-arr: A git web html generator.
""" """
from __future__ import print_function import configparser
import math import math
import optparse import optparse
import os import os
import re import re
import sys import sys
try:
import configparser
except ImportError:
import ConfigParser as configparser
import bottle import bottle
import git import git
@ -64,7 +58,7 @@ def load_config(path):
'generate_patch': 'yes', 'generate_patch': 'yes',
} }
config = configparser.SafeConfigParser(defaults) config = configparser.ConfigParser(defaults)
config.read(path) config.read(path)
# Do a first pass for general sanity checking and recursive expansion. # Do a first pass for general sanity checking and recursive expansion.
@ -118,7 +112,7 @@ def load_config(path):
r.info.commits_per_page = config.getint(s, 'commits_per_page') r.info.commits_per_page = config.getint(s, 'commits_per_page')
r.info.max_pages = config.getint(s, 'max_pages') r.info.max_pages = config.getint(s, 'max_pages')
if r.info.max_pages <= 0: if r.info.max_pages <= 0:
r.info.max_pages = sys.maxint r.info.max_pages = sys.maxsize
r.info.generate_tree = config.getboolean(s, 'tree') r.info.generate_tree = config.getboolean(s, 'tree')
r.info.root_diff = config.getboolean(s, 'rootdiff') r.info.root_diff = config.getboolean(s, 'rootdiff')
r.info.generate_patch = config.getboolean(s, 'generate_patch') r.info.generate_patch = config.getboolean(s, 'generate_patch')
@ -263,6 +257,10 @@ def blob(repo, bname, fname, dirname = ''):
fname = git.smstr.from_url(fname) fname = git.smstr.from_url(fname)
path = dirname.raw + fname.raw path = dirname.raw + fname.raw
# Handle backslash-escaped characters, which are not utf8.
# This matches the generated links from git.unquote().
path = path.encode("utf8").decode("unicode-escape").encode("latin1")
content = repo.blob(path, bname) content = repo.blob(path, bname)
if content is None: if content is None:
bottle.abort(404, "File %r not found in branch %s" % (path, bname)) bottle.abort(404, "File %r not found in branch %s" % (path, bname))
@ -339,7 +337,7 @@ def generate(output, only = None):
else: else:
# Otherwise, be lazy if we were given a function to run, or write # Otherwise, be lazy if we were given a function to run, or write
# always if they gave us a string. # always if they gave us a string.
if isinstance(func_or_str, (str, unicode)): if isinstance(func_or_str, str):
print(path) print(path)
s = func_or_str s = func_or_str
else: else:
@ -348,7 +346,7 @@ def generate(output, only = None):
print(path) print(path)
s = func_or_str(*args) s = func_or_str(*args)
open(path, 'w').write(s.encode('utf8', errors = 'xmlcharrefreplace')) open(path, 'w').write(s)
if mtime: if mtime:
os.utime(path, (mtime, mtime)) os.utime(path, (mtime, mtime))
@ -398,7 +396,7 @@ def generate(output, only = None):
write_to('static/syntax.css', read_f, [static_path + '/syntax.css'], write_to('static/syntax.css', read_f, [static_path + '/syntax.css'],
os.stat(static_path + '/syntax.css').st_mtime) os.stat(static_path + '/syntax.css').st_mtime)
rs = sorted(repos.values(), key = lambda r: r.name) rs = sorted(list(repos.values()), key = lambda r: r.name)
if only: if only:
rs = [r for r in rs if r.name in only] rs = [r for r in rs if r.name in only]

87
git.py

@ -12,35 +12,13 @@ import subprocess
from collections import defaultdict from collections import defaultdict
import email.utils import email.utils
import datetime import datetime
import urllib import urllib.request, urllib.parse, urllib.error
from cgi import escape from html import escape
# Path to the git binary. # Path to the git binary.
GIT_BIN = "git" GIT_BIN = "git"
class EncodeWrapper:
"""File-like wrapper that returns data utf8 encoded."""
def __init__(self, fd, encoding = 'utf8', errors = 'replace'):
self.fd = fd
self.encoding = encoding
self.errors = errors
def __iter__(self):
for line in self.fd:
yield line.decode(self.encoding, errors = self.errors)
def read(self):
"""Returns the whole content."""
s = self.fd.read()
return s.decode(self.encoding, errors = self.errors)
def readline(self):
"""Returns a single line."""
s = self.fd.readline()
return s.decode(self.encoding, errors = self.errors)
def run_git(repo_path, params, stdin = None, silent_stderr = False, raw = False): def run_git(repo_path, params, stdin = None, silent_stderr = False, raw = False):
"""Invokes git with the given parameters. """Invokes git with the given parameters.
@ -66,13 +44,8 @@ def run_git(repo_path, params, stdin = None, silent_stderr = False, raw = False)
if raw: if raw:
return p.stdout return p.stdout
# We need to wrap stdout if we want to decode it as utf8, subprocess return io.TextIOWrapper(p.stdout, encoding = 'utf8',
# doesn't support us telling it the encoding. errors = 'backslashreplace')
if sys.version_info.major == 3:
return io.TextIOWrapper(p.stdout, encoding = 'utf8',
errors = 'replace')
else:
return EncodeWrapper(p.stdout)
class GitCommand (object): class GitCommand (object):
@ -109,6 +82,8 @@ class GitCommand (object):
def stdin(self, s): def stdin(self, s):
"""Sets the contents we will send in stdin.""" """Sets the contents we will send in stdin."""
self._override = True self._override = True
if isinstance(s, str):
s = s.encode("utf8")
self._stdin_buf = s self._stdin_buf = s
self._override = False self._override = False
@ -116,7 +91,7 @@ class GitCommand (object):
"""Runs the git command.""" """Runs the git command."""
params = [self._cmd] params = [self._cmd]
for k, v in self._kwargs.items(): for k, v in list(self._kwargs.items()):
dash = '--' if len(k) > 1 else '-' dash = '--' if len(k) > 1 else '-'
if v is None: if v is None:
params.append('%s%s' % (dash, k)) params.append('%s%s' % (dash, k))
@ -146,11 +121,16 @@ class smstr:
.html -> an HTML-embeddable representation. .html -> an HTML-embeddable representation.
""" """
def __init__(self, raw): def __init__(self, raw):
if not isinstance(raw, str): if not isinstance(raw, (str, bytes)):
raise TypeError("The raw string must be instance of 'str'") raise TypeError(
"The raw string must be instance of 'str', not %s" %
type(raw))
self.raw = raw self.raw = raw
self.unicode = raw.decode('utf8', errors = 'replace') if isinstance(raw, bytes):
self.url = urllib.pathname2url(raw) self.unicode = raw.decode('utf8', errors = 'backslashreplace')
else:
self.unicode = raw
self.url = urllib.request.pathname2url(raw)
self.html = self._to_html() self.html = self._to_html()
def __cmp__(self, other): def __cmp__(self, other):
@ -163,7 +143,7 @@ class smstr:
@staticmethod @staticmethod
def from_url(url): def from_url(url):
"""Returns an smstr() instance from an url-encoded string.""" """Returns an smstr() instance from an url-encoded string."""
return smstr(urllib.url2pathname(url)) return smstr(urllib.request.url2pathname(url))
def split(self, sep): def split(self, sep):
"""Like str.split().""" """Like str.split()."""
@ -176,10 +156,10 @@ class smstr:
def _to_html(self): def _to_html(self):
"""Returns an html representation of the unicode string.""" """Returns an html representation of the unicode string."""
html = u'' html = ''
for c in escape(self.unicode): for c in escape(self.unicode):
if c in '\t\r\n\r\f\a\b\v\0': if c in '\t\r\n\r\f\a\b\v\0':
esc_c = c.encode('ascii').encode('string_escape') esc_c = c.encode("unicode-escape").decode("utf8")
html += '<span class="ctrlchr">%s</span>' % esc_c html += '<span class="ctrlchr">%s</span>' % esc_c
else: else:
html += c html += c
@ -190,14 +170,23 @@ class smstr:
def unquote(s): def unquote(s):
"""Git can return quoted file names, unquote them. Always return a str.""" """Git can return quoted file names, unquote them. Always return a str."""
if not (s[0] == '"' and s[-1] == '"'): if not (s[0] == '"' and s[-1] == '"'):
# Unquoted strings are always safe, no need to mess with them; just # Unquoted strings are always safe, no need to mess with them
# make sure we return str.
s = s.encode('ascii')
return s return s
# Get rid of the quotes, we never want them in the output, and convert to # The string will be of the form `"<escaped>"`, where <escaped> is a
# a raw string, un-escaping the backslashes. # backslash-escaped representation of the name of the file.
s = s[1:-1].decode('string-escape') # Examples: "with\ttwo\ttabs" , "\303\261aca-utf8", "\361aca-latin1"
# Get rid of the quotes, we never want them in the output.
s = s[1:-1]
# Un-escape the backslashes.
# latin1 is ok to use here because in Python it just maps the code points
# 0-255 to the bytes 0x00-0xff, which is what we expect.
s = s.encode("latin1").decode("unicode-escape")
# Convert to utf8.
s = s.encode("latin1").decode("utf8", errors='backslashreplace')
return s return s
@ -337,13 +326,13 @@ class Repo:
cmd.raw(True) cmd.raw(True)
cmd.batch = '%(objectsize)' cmd.batch = '%(objectsize)'
if isinstance(ref, unicode): # Format: <ref>:<path>
ref = ref.encode('utf8') # Construct it in binary since the path might not be utf8.
cmd.stdin('%s:%s' % (ref, path)) cmd.stdin(ref.encode("utf8") + b":" + path)
out = cmd.run() out = cmd.run()
head = out.readline() head = out.readline()
if not head or head.strip().endswith('missing'): if not head or head.strip().endswith(b'missing'):
return None return None
return Blob(out.read()[:int(head)]) return Blob(out.read()[:int(head)])

@ -108,15 +108,17 @@ def markdown_blob(s):
def embed_image_blob(fname, image_data): def embed_image_blob(fname, image_data):
mimetype = mimetypes.guess_type(fname)[0] mimetype = mimetypes.guess_type(fname)[0]
b64img = base64.b64encode(image_data).decode("ascii")
return '<img style="max-width:100%;" src="data:{0};base64,{1}" />'.format( \ return '<img style="max-width:100%;" src="data:{0};base64,{1}" />'.format( \
mimetype, base64.b64encode(image_data)) mimetype, b64img)
def is_binary(s): def is_binary(s):
# Git considers a blob binary if NUL in first ~8KB, so do the same. # Git considers a blob binary if NUL in first ~8KB, so do the same.
return '\0' in s[:8192] return b'\0' in s[:8192]
def hexdump(s): def hexdump(s):
graph = string.ascii_letters + string.digits + string.punctuation + ' ' graph = string.ascii_letters + string.digits + string.punctuation + ' '
s = s.decode("latin1")
offset = 0 offset = 0
while s: while s:
t = s[:16] t = s[:16]

@ -1,5 +1,5 @@
<table class="nice toggable ls" id="ls"> <table class="nice toggable ls" id="ls">
% key_func = lambda (t, n, s): (t != 'tree', n.raw) % key_func = lambda x: (x[0] != 'tree', x[1].raw)
% for type, name, size in sorted(tree.ls(dirname.raw), key = key_func): % for type, name, size in sorted(tree.ls(dirname.raw), key = key_func):
<tr class="{{type}}"> <tr class="{{type}}">
% if type == "blob": % if type == "blob":