Commit
Patch #534304: Implement phase 1 of PEP 263.
loewis committed Aug 4, 2002
1 parent a729daf commit 00f1e3f
Showing 13 changed files with 656 additions and 31 deletions.
44 changes: 39 additions & 5 deletions Doc/ref/ref2.tex
@@ -7,11 +7,14 @@ \chapter{Lexical analysis\label{lexical}}
\index{parser}
\index{token}

Python uses the 7-bit \ASCII{} character set for program text and string
literals. 8-bit characters may be used in string literals and comments
but their interpretation is platform dependent; the proper way to
insert 8-bit characters in string literals is by using octal or
hexadecimal escape sequences.
Python uses the 7-bit \ASCII{} character set for program text.
\versionadded[An encoding declaration can be used to indicate that
string literals and comments use an encoding different from ASCII.]{2.3}
For compatibility with older versions, Python only warns when it finds
8-bit characters; such warnings should be resolved by declaring an
explicit encoding, or by using escape sequences if those bytes are
binary data rather than characters.


The run-time character set depends on the I/O devices connected to the
program but is generally a superset of \ASCII.
@@ -69,6 +72,37 @@ \subsection{Comments\label{comments}}
\index{hash character}


\subsection{Encoding declarations\label{encodings}}

If a comment in the first or second line of the Python script matches
the regular expression "coding[=:]\s*([-\w.]+)", this comment is
processed as an encoding declaration; the first group of this
expression names the encoding of the source code file. (The hyphen
must come first in the character class, so that it is not read as a
range.) The recommended forms of this expression are

\begin{verbatim}
# -*- coding: <encoding-name> -*-
\end{verbatim}

which is recognized also by GNU Emacs, and

\begin{verbatim}
# vim:fileencoding=<encoding-name>
\end{verbatim}

which is recognized by Bram Moolenaar's Vim. In addition, if the first
bytes of the file are the UTF-8 signature (\code{'\e xef\e xbb\e xbf'}),
the declared file encoding is UTF-8 (this is supported by, among
others, Microsoft's notepad.exe).
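The detection rules just described (a two-line comment scan plus the UTF-8 signature check) can be sketched in a few lines of modern Python. This is an illustrative restatement of the rules, not the C tokenizer code this patch adds; the function name is invented for the example.

```python
import re

# PEP 263 declaration pattern; the character class is ([-\w.]+),
# with the hyphen first so it is not read as a range.
CODING_RE = re.compile(r"coding[=:]\s*([-\w.]+)")
BOM_UTF8 = b"\xef\xbb\xbf"

def detect_source_encoding(source: bytes) -> str:
    """Return the declared encoding of Python source (default: ascii)."""
    if source.startswith(BOM_UTF8):
        return "utf-8"
    # Only the first two lines may carry an encoding declaration.
    for line in source.splitlines()[:2]:
        m = CODING_RE.search(line.decode("ascii", "replace"))
        if m:
            return m.group(1)
    return "ascii"
```

Both recommended comment forms match, since the pattern only looks for `coding[=:]` followed by an encoding name anywhere in the line.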

If an encoding is declared, the encoding name must be recognized by
Python. % XXX there should be a list of supported encodings.
The encoding is used for all lexical analysis, in particular to find
the end of a string, and to interpret the contents of Unicode literals.
String literals are converted to Unicode for syntactic analysis,
then converted back to their original encoding before interpretation
starts.
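In present-day Python 3 the effect of a declaration can still be observed by handing raw bytes to `compile()`, which applies the coding comment before parsing. This is a modern illustration of the behaviour the patch introduces, not the 2.3-era code path itself.

```python
# Latin-1 source: the raw byte 0xe9 is 'é' in latin-1.
src = b"# -*- coding: latin-1 -*-\ns = '\xe9'\n"

ns = {}
# compile() applies the declared encoding when given bytes.
exec(compile(src, "<pep263-demo>", "exec"), ns)
print(ns["s"])  # U+00E9, LATIN SMALL LETTER E WITH ACUTE
```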

\subsection{Explicit line joining\label{explicit-joining}}

Two or more physical lines may be joined into logical lines using
3 changes: 3 additions & 0 deletions Grammar/Grammar
@@ -102,3 +102,6 @@ list_for: 'for' exprlist 'in' testlist_safe [list_iter]
list_if: 'if' test [list_iter]

testlist1: test (',' test)*

# not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
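The encoding the tokenizer records (and that this `encoding_decl` node carries from the parser to the compiler) is exposed at the Python level in today's standard library by `tokenize.detect_encoding`. Shown here as a later view of the same mechanism, not an API added by this patch; note the modern function also normalizes aliases and defaults to UTF-8 rather than ASCII.

```python
import io
import tokenize

src = b"# -*- coding: latin-1 -*-\nx = 1\n"
# detect_encoding reads at most two lines and applies the PEP 263 rules.
enc, consumed = tokenize.detect_encoding(io.BytesIO(src).readline)
print(enc)  # 'iso-8859-1' -- 'latin-1' is normalized to its canonical name
```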
1 change: 1 addition & 0 deletions Include/errcode.h
@@ -25,6 +25,7 @@ extern "C" {
#define E_OVERFLOW 19 /* Node had too many children */
#define E_TOODEEP 20 /* Too many indentation levels */
#define E_DEDENT 21 /* No matching outer block for dedent */
#define E_DECODE 22 /* Error in decoding into Unicode */

#ifdef __cplusplus
}
1 change: 1 addition & 0 deletions Include/graminit.h
@@ -65,3 +65,4 @@
#define list_for 320
#define list_if 321
#define testlist1 322
#define encoding_decl 323
6 changes: 4 additions & 2 deletions Makefile.pre.in
@@ -190,15 +190,15 @@ POBJS= \
Parser/node.o \
Parser/parser.o \
Parser/parsetok.o \
Parser/tokenizer.o \
Parser/bitset.o \
Parser/metagrammar.o

PARSER_OBJS= $(POBJS) Parser/myreadline.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/tokenizer.o

PGOBJS= \
Objects/obmalloc.o \
Python/mysnprintf.o \
Parser/tokenizer_pgen.o \
Parser/firstsets.o \
Parser/grammar.o \
Parser/pgen.o \
@@ -434,6 +434,8 @@ Parser/grammar.o: $(srcdir)/Parser/grammar.c \
$(srcdir)/Include/grammar.h
Parser/metagrammar.o: $(srcdir)/Parser/metagrammar.c

Parser/tokenizer_pgen.o: $(srcdir)/Parser/tokenizer.c


Python/compile.o Python/symtable.o: $(GRAMMAR_H)

2 changes: 2 additions & 0 deletions Misc/NEWS
@@ -6,6 +6,8 @@ Type/class unification and new-style classes

Core and builtins

- Encoding declarations (PEP 263, phase 1) have been implemented.

- list.sort() has a new implementation. While cross-platform results
may vary, and in data-dependent ways, this is much faster on many
kinds of partially ordered lists than the previous implementation,
12 changes: 10 additions & 2 deletions Parser/parsetok.c
@@ -8,6 +8,7 @@
#include "parser.h"
#include "parsetok.h"
#include "errcode.h"
#include "graminit.h"

int Py_TabcheckFlag;

@@ -45,8 +46,8 @@ PyParser_ParseStringFlagsFilename(char *s, char *filename,
return NULL;
}

tok->filename = filename ? filename : "<string>";
if (Py_TabcheckFlag || Py_VerboseFlag) {
tok->filename = filename ? filename : "<string>";
tok->altwarning = (tok->filename != NULL);
if (Py_TabcheckFlag >= 2)
tok->alterror++;
@@ -78,8 +79,8 @@ PyParser_ParseFileFlags(FILE *fp, char *filename, grammar *g, int start,
err_ret->error = E_NOMEM;
return NULL;
}
tok->filename = filename;
if (Py_TabcheckFlag || Py_VerboseFlag) {
tok->filename = filename;
tok->altwarning = (filename != NULL);
if (Py_TabcheckFlag >= 2)
tok->alterror++;
@@ -185,6 +186,13 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
err_ret->text[len] = '\0';
}
}
} else if (tok->encoding != NULL) {
node* r = PyNode_New(encoding_decl);
r->n_str = tok->encoding;
r->n_nchildren = 1;
r->n_child = n;
tok->encoding = NULL;
n = r;
}

PyTokenizer_Free(tok);
