From 6e7199215409180ca6cdeb00da1aca52554564b6 Mon Sep 17 00:00:00 2001 From: Erik Johnson Date: Tue, 6 Feb 2018 19:52:18 -0600 Subject: [PATCH] Explicitly use utf-8 when decoding bytestrings While Python 3 defaults to utf-8 in `bytes.decode()`, Python 2's equivalent (`str.decode()`) will use the default encoding as set by site.py (which is almost always ascii). From looking at the code, it seems that these decodes have just sort of been fixed piecemeal (likely when someone realized that pygit2 was failing to handle unicode properly), but any decodes which run on Python 2 that don't specify utf-8 as the encoding are a ticking time bomb. I personally noticed this was a problem when I encountered a traceback in the RemoteCallbacks while fetching a new branch which contained utf-8 characters. During the fetch, when `pygit2.remote.maybe_string()` was invoked by `_update_tips_cb()` with a pointer to a bytestring containing unicode, the decode failed because the default encoding is ascii. As it turns out, this was fixed in master, but there are a number of other decodes which still have no explicit encoding. This commit explicitly uses utf-8 for all remaining bytestring decodes which do not have an encoding specified, aside from one in PY3-specific code where doing so would be redundant. 
--- pygit2/blame.py | 2 +- pygit2/config.py | 2 +- pygit2/refspec.py | 8 ++++---- pygit2/remote.py | 2 +- pygit2/utils.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pygit2/blame.py b/pygit2/blame.py index 1d8728a1c..4b1ae4762 100644 --- a/pygit2/blame.py +++ b/pygit2/blame.py @@ -97,7 +97,7 @@ def orig_path(self): if not path: return None - return ffi.string(path).decode() + return ffi.string(path).decode('utf-8') class Blame(object): diff --git a/pygit2/config.py b/pygit2/config.py index 8933675d0..2c94ea7d9 100644 --- a/pygit2/config.py +++ b/pygit2/config.py @@ -263,7 +263,7 @@ def _from_found_config(fn): buf = ffi.new('git_buf *', (ffi.NULL, 0)) err = fn(buf) check_error(err, True) - cpath = ffi.string(buf.ptr).decode() + cpath = ffi.string(buf.ptr).decode('utf-8') C.git_buf_free(buf) return Config(cpath) diff --git a/pygit2/refspec.py b/pygit2/refspec.py index 08954a5c0..1d83c1d60 100644 --- a/pygit2/refspec.py +++ b/pygit2/refspec.py @@ -43,12 +43,12 @@ def __init__(self, owner, ptr): @property def src(self): """Source or lhs of the refspec""" - return ffi.string(C.git_refspec_src(self._refspec)).decode() + return ffi.string(C.git_refspec_src(self._refspec)).decode('utf-8') @property def dst(self): """Destinaton or rhs of the refspec""" - return ffi.string(C.git_refspec_dst(self._refspec)).decode() + return ffi.string(C.git_refspec_dst(self._refspec)).decode('utf-8') @property def force(self): @@ -58,7 +58,7 @@ def force(self): @property def string(self): """String which was used to create this refspec""" - return ffi.string(C.git_refspec_string(self._refspec)).decode() + return ffi.string(C.git_refspec_string(self._refspec)).decode('utf-8') @property def direction(self): @@ -82,7 +82,7 @@ def _transform(self, ref, fn): check_error(err) try: - return ffi.string(buf.ptr).decode() + return ffi.string(buf.ptr).decode('utf-8') finally: C.git_buf_free(buf) diff --git a/pygit2/remote.py b/pygit2/remote.py index 7a0702066..47e5c7ed3 100644 
--- a/pygit2/remote.py +++ b/pygit2/remote.py @@ -238,7 +238,7 @@ def _sideband_progress_cb(string, length, data): return 0 try: - s = ffi.string(string, length).decode() + s = ffi.string(string, length).decode('utf-8') progress(s) except Exception as e: self._stored_exception = e diff --git a/pygit2/utils.py b/pygit2/utils.py index 4821011ac..2497da5d9 100644 --- a/pygit2/utils.py +++ b/pygit2/utils.py @@ -45,7 +45,7 @@ def strarray_to_strings(arr): l = [None] * arr.count for i in range(arr.count): - l[i] = ffi.string(arr.strings[i]).decode() + l[i] = ffi.string(arr.strings[i]).decode('utf-8') return l