Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Lib/test/test_compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ def test_particularly_evil_undecodable(self):
with open(fn, "wb") as fp:
fp.write(src)
res = script_helper.run_python_until_end(fn)[0]
self.assertIn(b"Non-UTF-8", res.err)
self.assertIn(b"SyntaxError", res.err)

def test_yet_more_evil_still_undecodable(self):
# Issue #25388
Expand All @@ -528,7 +528,7 @@ def test_yet_more_evil_still_undecodable(self):
with open(fn, "wb") as fp:
fp.write(src)
res = script_helper.run_python_until_end(fn)[0]
self.assertIn(b"Non-UTF-8", res.err)
self.assertIn(b"", res.err)

@support.cpython_only
def test_compiler_recursion_limit(self):
Expand Down
14 changes: 14 additions & 0 deletions Lib/test/test_source_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,20 @@ def test_20731(self):
self.assertEqual(sub.returncode, 0)
self.assertNotIn(b'SyntaxError', err)

def test_issue34979(self):
# Regression test for issue 34979: write a script with no coding
# declaration whose only content is one long string literal made of
# 3-byte UTF-8 sequences, run it, and check that the string is printed
# back unchanged.
#
# BUFSIZ is defined in <stdio.h> and its value is platform dependent;
# 8192 is assumed here to be the largest value in use.
bufsiz = 8192
# 5 is len(b's = "'), the prefix written before the repeated sequence.
# The extra 1 presumably accounts for the NUL appended by fgets(), so
# that a 3-byte sequence lands on the read-buffer boundary -- TODO confirm.
strlen = round((bufsiz - 1 - 5) / 3)
# b'\xe6\xb5\x8b' is a single 3-byte UTF-8 sequence.
string = b'\xe6\xb5\x8b' * strlen
src = b's = "' + string + b'"\nprint(s)'
with tempfile.TemporaryDirectory() as tmpd:
fn = os.path.join(tmpd, 'test.py')
# Binary mode so the source bytes are written exactly as built above.
with open(fn, 'wb') as fp:
fp.write(src)
# Run the script and compare its stdout (minus the trailing newline)
# with the expected byte string.
res = script_helper.assert_python_ok(fn)
self.assertEqual(res.out.rstrip(), b'\xe6\xb5\x8b' * strlen)

def test_error_message(self):
compile(b'# -*- coding: iso-8859-15 -*-\n', 'dummy', 'exec')
compile(b'\xef\xbb\xbf\n', 'dummy', 'exec')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix the spurious "SyntaxError: Non-UTF-8 code starting with '\xe8'..."
error raised by the function decoding_fgets.
64 changes: 13 additions & 51 deletions Parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ fp_setreadl(struct tok_state *tok, const char* enc)
pos = ftell(tok->fp);
if (pos == -1 ||
lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
PyErr_Format(PyExc_OSError, "%d", errno);
return 0;
}

Expand Down Expand Up @@ -551,33 +551,6 @@ static void fp_ungetc(int c, struct tok_state *tok) {
ungetc(c, tok->fp);
}

/* Check whether the bytes at s start a structurally valid UTF-8
   sequence.

   s must point into a NUL-terminated buffer.  Return the number of
   bytes forming the sequence (1 to 4) if the lead byte is followed by
   the required number of continuation bytes (0x80..0xBF), and 0
   otherwise.  Only the byte pattern is checked: overlong forms and
   out-of-range code points are not rejected here. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (*s < 0xC0) {
        /* continuation byte without a lead byte */
        return 0;
    }
    if (*s < 0xE0) {
        expected = 1;
    }
    else if (*s < 0xF0) {
        expected = 2;
    }
    else if (*s < 0xF8) {
        expected = 3;
    }
    else {
        /* 0xF8..0xFF never start a sequence */
        return 0;
    }

    /* Walk the continuation bytes front to back.  A NUL terminator is
       < 0x80, so a sequence truncated by the end of the string is
       rejected at the terminator itself and no byte beyond it is ever
       read (checking from the last byte backwards would read past the
       terminator first). */
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return expected + 1;
}

/* Read a line of input from TOK. Determine encoding
if necessary. */
Expand All @@ -586,7 +559,6 @@ static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
char *line = NULL;
int badchar = 0;
for (;;) {
if (tok->decoding_state == STATE_NORMAL) {
/* We already have a codec associated with
Expand All @@ -612,30 +584,20 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
return error_ret(tok);
}
}
#ifndef PGEN
/* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */
if (line && !tok->encoding) {
unsigned char *c;
int length;
for (c = (unsigned char *)line; *c; c += length)
if (!(length = valid_utf8(c))) {
badchar = *c;
break;
/* If we can't find coding spec, then try to set default encoding to utf-8 */
if (tok->lineno >= 2 && !tok->encoding) {
char* cs = new_string("utf-8", 5, tok);
int r = fp_setreadl(tok, cs);
if (r) {
tok->encoding = cs;
tok->decoding_state = STATE_NORMAL;
} else {
if (!PyErr_Occurred()) {
PyErr_Format(PyExc_SyntaxError, "setting default encoding to utf-8 failed");
}
return error_ret(tok);
}
}
if (badchar) {
/* Need to add 1 to the line number, since this line
has not been counted, yet. */
PyErr_Format(PyExc_SyntaxError,
"Non-UTF-8 code starting with '\\x%.2x' "
"in file %U on line %i, "
"but no encoding declared; "
"see http://python.org/dev/peps/pep-0263/ for details",
badchar, tok->filename, tok->lineno + 1);
return error_ret(tok);
}
#endif
return line;
}

Expand Down