python · ausaki · Oct 17, 2018 · Oct 17, 2018
diff --git a/Lib/test/test_compile.py b/Lib/test/test_compile.py
@@ -518,7 +518,7 @@ def test_particularly_evil_undecodable(self):
             with open(fn, "wb") as fp:
                 fp.write(src)
             res = script_helper.run_python_until_end(fn)[0]
-        self.assertIn(b"Non-UTF-8", res.err)
+        self.assertIn(b"SyntaxError", res.err)
 
     def test_yet_more_evil_still_undecodable(self):
         # Issue #25388
@@ -528,7 +528,7 @@ def test_yet_more_evil_still_undecodable(self):
             with open(fn, "wb") as fp:
                 fp.write(src)
             res = script_helper.run_python_until_end(fn)[0]
-        self.assertIn(b"Non-UTF-8", res.err)
+        self.assertIn(b"", res.err)
 
     @support.cpython_only
     def test_compiler_recursion_limit(self):

diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
@@ -69,6 +69,20 @@ def test_20731(self):
         self.assertEqual(sub.returncode, 0)
         self.assertNotIn(b'SyntaxError', err)
 
+    def test_issue34979(self):
+        # BUFSIZ is defined in <stdio.h> and is platform dependent;
+        # its value may be as large as 8192, which is what is assumed here.
+        bufsiz = 8192
+        strlen = round((bufsiz - 1 - 5) / 3)
+        string = b'\xe6\xb5\x8b' * strlen
+        src = b's = "' + string +  b'"\nprint(s)'
+        with tempfile.TemporaryDirectory() as tmpd:
+            fn = os.path.join(tmpd, 'test.py')
+            with open(fn, 'wb') as fp:
+                fp.write(src)
+            res = script_helper.assert_python_ok(fn)
+        self.assertEqual(res.out.rstrip(), b'\xe6\xb5\x8b' * strlen)
+
     def test_error_message(self):
         compile(b'# -*- coding: iso-8859-15 -*-\n', 'dummy', 'exec')
         compile(b'\xef\xbb\xbf\n', 'dummy', 'exec')

diff --git a/Misc/NEWS.d/next/Core and Builtins/2018-10-17-16-16-13.bpo-34979.vCBvTX.rst b/Misc/NEWS.d/next/Core and Builtins/2018-10-17-16-16-13.bpo-34979.vCBvTX.rst
@@ -0,0 +1,2 @@
+Fix "SyntaxError: Non-UTF-8 code starting with '\xe8'..." caused by the
+function decoding_fgets.
@@ -509,7 +509,7 @@ fp_setreadl(struct tok_state *tok, const char* enc)
     pos = ftell(tok->fp);
     if (pos == -1 ||
         lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
-        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
+        PyErr_Format(PyExc_OSError, "%d", errno);
         return 0;
     }
 
@@ -551,33 +551,6 @@ static void fp_ungetc(int c, struct tok_state *tok) {
     ungetc(c, tok->fp);
 }
 
-/* Check whether the characters at s start a valid
-   UTF-8 sequence. Return the number of characters forming
-   the sequence if yes, 0 if not.  */
-static int valid_utf8(const unsigned char* s)
-{
-    int expected = 0;
-    int length;
-    if (*s < 0x80)
-        /* single-byte code */
-        return 1;
-    if (*s < 0xc0)
-        /* following byte */
-        return 0;
-    if (*s < 0xE0)
-        expected = 1;
-    else if (*s < 0xF0)
-        expected = 2;
-    else if (*s < 0xF8)
-        expected = 3;
-    else
-        return 0;
-    length = expected + 1;
-    for (; expected; expected--)
-        if (s[expected] < 0x80 || s[expected] >= 0xC0)
-            return 0;
-    return length;
-}
 
 /* Read a line of input from TOK. Determine encoding
    if necessary.  */
@@ -586,7 +559,6 @@ static char *
 decoding_fgets(char *s, int size, struct tok_state *tok)
 {
     char *line = NULL;
-    int badchar = 0;
     for (;;) {
         if (tok->decoding_state == STATE_NORMAL) {
             /* We already have a codec associated with
@@ -612,30 +584,20 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
             return error_ret(tok);
         }
     }
-#ifndef PGEN
-    /* The default encoding is UTF-8, so make sure we don't have any
-       non-UTF-8 sequences in it. */
-    if (line && !tok->encoding) {
-        unsigned char *c;
-        int length;
-        for (c = (unsigned char *)line; *c; c += length)
-            if (!(length = valid_utf8(c))) {
-                badchar = *c;
-                break;
+    /* If we can't find coding spec, then try to set default encoding to utf-8 */
+    if (tok->lineno >= 2 && !tok->encoding) {
+        char* cs = new_string("utf-8", 5, tok);
+        int r = fp_setreadl(tok, cs);
+        if (r) {
+            tok->encoding = cs;
+            tok->decoding_state = STATE_NORMAL;
+        } else {
+            if (!PyErr_Occurred()) {
+                PyErr_Format(PyExc_SyntaxError, "setting default encoding to utf-8 failed");
             }
+            return error_ret(tok);
+        }
     }
-    if (badchar) {
-        /* Need to add 1 to the line number, since this line
-           has not been counted, yet.  */
-        PyErr_Format(PyExc_SyntaxError,
-                "Non-UTF-8 code starting with '\\x%.2x' "
-                "in file %U on line %i, "
-                "but no encoding declared; "
-                "see http://python.org/dev/peps/pep-0263/ for details",
-                badchar, tok->filename, tok->lineno + 1);
-        return error_ret(tok);
-    }
-#endif
     return line;
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		Fix "SyntaxError: Non-UTF-8 code starting with '\xe8'..." caused by the
		function decoding_fgets.