Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Include/fileutils.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(

#endif /* Py_LIMITED_API */


#ifdef Py_BUILD_CORE
/* Core-only helper: return non-zero when the ASCII encoding must be forced
   instead of the encoding announced by the current locale, and 0 otherwise.
   Only declared when building the interpreter core (Py_BUILD_CORE). */
PyAPI_FUNC(int) _Py_GetForceASCII(void);
#endif

#ifdef __cplusplus
}
#endif
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
On HP-UX with C or POSIX locale, sys.getfilesystemencoding() now returns
"ascii" instead of "roman8" (when the UTF-8 Mode is disabled and the C locale
is not coerced).
15 changes: 9 additions & 6 deletions Python/coreconfig.c
Original file line number Diff line number Diff line change
Expand Up @@ -828,18 +828,21 @@ config_read_complex_options(_PyCoreConfig *config)
static void
config_init_locale(_PyCoreConfig *config)
{
if (_Py_LegacyLocaleDetected()) {
if (config->coerce_c_locale < 0) {
/* The C locale enables the C locale coercion (PEP 538) */
if (config->coerce_c_locale < 0) {
if (_Py_LegacyLocaleDetected()) {
config->coerce_c_locale = 1;
}
}

#ifndef MS_WINDOWS
const char *ctype_loc = setlocale(LC_CTYPE, NULL);
if (ctype_loc != NULL
&& (strcmp(ctype_loc, "C") == 0 || strcmp(ctype_loc, "POSIX") == 0)) {
if (config->utf8_mode < 0) {
/* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
if (config->utf8_mode < 0) {
const char *ctype_loc = setlocale(LC_CTYPE, NULL);
if (ctype_loc != NULL
&& (strcmp(ctype_loc, "C") == 0
|| strcmp(ctype_loc, "POSIX") == 0))
{
config->utf8_mode = 1;
}
}
Expand Down
104 changes: 71 additions & 33 deletions Python/fileutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ _Py_device_encoding(int fd)

extern int _Py_normalize_encoding(const char *, char *, size_t);

/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale.
On these operating systems, nl_langinfo(CODESET) announces an alias of the
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
and POSIX locale. nl_langinfo(CODESET) announces an alias of the
ASCII encoding, whereas mbstowcs() and wcstombs() functions use the
ISO-8859-1 encoding. The problem is that os.fsencode() and os.fsdecode() use
locale.getpreferredencoding() codec. For example, if command line arguments
Expand All @@ -86,6 +86,10 @@ extern int _Py_normalize_encoding(const char *, char *, size_t);
workaround is also enabled on error, for example if getting the locale
failed.

On HP-UX with the C locale or the POSIX locale, nl_langinfo(CODESET)
announces "roman8" but mbstowcs() uses Latin1 in practice. Also force the
ASCII encoding in this case.

Values of force_ascii:

1: the workaround is used: Py_EncodeLocale() uses
Expand All @@ -100,13 +104,46 @@ static int force_ascii = -1;
static int
check_force_ascii(void)
{
char *loc;
char *loc = setlocale(LC_CTYPE, NULL);
if (loc == NULL) {
goto error;
}
if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
/* the LC_CTYPE locale is different than C and POSIX */
return 0;
}

#if defined(HAVE_LANGINFO_H) && defined(CODESET)
char *codeset, **alias;
const char *codeset = nl_langinfo(CODESET);
if (!codeset || codeset[0] == '\0') {
/* CODESET is not set or empty */
goto error;
}

char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
int is_ascii;
unsigned int i;
char* ascii_aliases[] = {
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
goto error;
}

#ifdef __hpux
if (strcmp(encoding, "roman8") == 0) {
unsigned char ch;
wchar_t wch;
size_t res;

ch = (unsigned char)0xA7;
res = mbstowcs(&wch, (char*)&ch, 1);
if (res != (size_t)-1 && wch == L'\xA7') {
/* On HP-UX with the C locale or the POSIX locale,
nl_langinfo(CODESET) announces "roman8", whereas mbstowcs() uses
Latin1 encoding in practice. Force ASCII in this case.

Roman8 decodes 0xA7 to U+00CF. Latin1 decodes 0xA7 to U+00A7. */
return 1;
}
}
#else
const char* ascii_aliases[] = {
"ascii",
/* Aliases from Lib/encodings/aliases.py */
"646",
Expand All @@ -123,27 +160,9 @@ check_force_ascii(void)
"us_ascii",
NULL
};
#endif

loc = setlocale(LC_CTYPE, NULL);
if (loc == NULL)
goto error;
if (strcmp(loc, "C") != 0 && strcmp(loc, "POSIX") != 0) {
/* the LC_CTYPE locale is different than C */
return 0;
}

#if defined(HAVE_LANGINFO_H) && defined(CODESET)
codeset = nl_langinfo(CODESET);
if (!codeset || codeset[0] == '\0') {
/* CODESET is not set or empty */
goto error;
}
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding)))
goto error;

is_ascii = 0;
for (alias=ascii_aliases; *alias != NULL; alias++) {
int is_ascii = 0;
for (const char **alias=ascii_aliases; *alias != NULL; alias++) {
if (strcmp(encoding, *alias) == 0) {
is_ascii = 1;
break;
Expand All @@ -154,13 +173,14 @@ check_force_ascii(void)
return 0;
}

for (i=0x80; i<0xff; i++) {
unsigned char ch;
wchar_t wch;
for (unsigned int i=0x80; i<=0xff; i++) {
char ch[1];
wchar_t wch[1];
size_t res;

ch = (unsigned char)i;
res = mbstowcs(&wch, (char*)&ch, 1);
unsigned uch = (unsigned char)i;
ch[0] = (char)uch;
res = mbstowcs(wch, ch, 1);
if (res != (size_t)-1) {
/* decoding a non-ASCII character from the locale encoding succeeded:
the locale encoding is not ASCII, force ASCII */
Expand All @@ -169,17 +189,29 @@ check_force_ascii(void)
}
/* None of the bytes in the range 0x80-0xff can be decoded from the locale
encoding: the locale encoding is really ASCII */
#endif /* !defined(__hpux) */
return 0;
#else
/* nl_langinfo(CODESET) is not available: always force ASCII */
return 1;
#endif
#endif /* defined(HAVE_LANGINFO_H) && defined(CODESET) */

error:
/* if an error occurred, force the ASCII encoding */
return 1;
}


/* Return the force_ascii flag, computing it on first use.

   force_ascii holds -1 until it has been evaluated; the first call runs
   check_force_ascii() and stores its result so later calls reuse it. */
int
_Py_GetForceASCII(void)
{
    if (force_ascii != -1) {
        /* Already evaluated: reuse the stored answer. */
        return force_ascii;
    }

    force_ascii = check_force_ascii();
    return force_ascii;
}


static int
encode_ascii(const wchar_t *text, char **str,
size_t *error_pos, const char **reason,
Expand Down Expand Up @@ -234,6 +266,12 @@ encode_ascii(const wchar_t *text, char **str,
*str = result;
return 0;
}
#else
/* Variant for the platforms excluded by the surrounding preprocessor
   condition (__APPLE__, __ANDROID__ or MS_WINDOWS defined): the ASCII
   workaround is never enabled there, so always report 0. */
int
_Py_GetForceASCII(void)
{
    const int never_forced = 0;
    return never_forced;
}
#endif /* !defined(__APPLE__) && !defined(__ANDROID__) && !defined(MS_WINDOWS) */


Expand Down
30 changes: 17 additions & 13 deletions Python/pylifecycle.c
Original file line number Diff line number Diff line change
Expand Up @@ -1576,21 +1576,25 @@ initfsencoding(PyInterpreterState *interp)
Py_FileSystemDefaultEncodeErrors = "surrogatepass";
}
#else
if (Py_FileSystemDefaultEncoding == NULL &&
interp->core_config.utf8_mode)
{
Py_FileSystemDefaultEncoding = "utf-8";
Py_HasFileSystemDefaultEncoding = 1;
}
else if (Py_FileSystemDefaultEncoding == NULL) {
Py_FileSystemDefaultEncoding = get_locale_encoding();
if (Py_FileSystemDefaultEncoding == NULL) {
return _Py_INIT_ERR("Unable to get the locale encoding");
if (Py_FileSystemDefaultEncoding == NULL) {
if (interp->core_config.utf8_mode) {
Py_FileSystemDefaultEncoding = "utf-8";
Py_HasFileSystemDefaultEncoding = 1;
}
else if (_Py_GetForceASCII()) {
Py_FileSystemDefaultEncoding = "ascii";
Py_HasFileSystemDefaultEncoding = 1;
}
else {
Py_FileSystemDefaultEncoding = get_locale_encoding();
if (Py_FileSystemDefaultEncoding == NULL) {
return _Py_INIT_ERR("Unable to get the locale encoding");
}

Py_HasFileSystemDefaultEncoding = 0;
interp->fscodec_initialized = 1;
return _Py_INIT_OK();
Py_HasFileSystemDefaultEncoding = 0;
interp->fscodec_initialized = 1;
return _Py_INIT_OK();
}
}
#endif

Expand Down