From 17d65547df55eaefe077c45242a7f2d175961dfd Mon Sep 17 00:00:00 2001
From: Lysandros Nikolaou
Date: Thu, 12 Oct 2023 09:34:35 +0200
Subject: [PATCH] gh-104169: Fix test_peg_generator after tokenizer
 refactoring (#110727)

* Fix test_peg_generator after tokenizer refactoring

* Remove references to tokenizer.c in comments etc.
---
 Lib/test/test_exceptions.py        |  2 +-
 Lib/test/test_source_encoding.py   |  2 +-
 Lib/test/test_tokenize.py          |  4 ++--
 Lib/tokenize.py                    |  2 +-
 Modules/config.c.in                |  2 +-
 Parser/myreadline.c                |  4 ++--
 Parser/string_parser.c             |  5 +++--
 Python/traceback.c                 |  2 +-
 Tools/c-analyzer/TODO              |  4 ++--
 Tools/peg_generator/pegen/build.py | 11 ++++++++++-
 10 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py
index 106baf959a6898..05a89e7705e90f 100644
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -253,7 +253,7 @@ def testSyntaxErrorOffset(self):
         check('try:\n pass\nexcept*:\n pass', 3, 8)
         check('try:\n pass\nexcept*:\n pass\nexcept* ValueError:\n pass', 3, 8)
 
-        # Errors thrown by tokenizer.c
+        # Errors thrown by the tokenizer
         check('(0x+1)', 1, 3)
         check('x = 0xI', 1, 6)
         check('0010 + 2', 1, 1)
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index 27871378f1c79e..61b00778f8361c 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -255,7 +255,7 @@ class UTF8ValidatorTest(unittest.TestCase):
     def test_invalid_utf8(self):
         # This is a port of test_utf8_decode_invalid_sequences in
         # test_unicode.py to exercise the separate utf8 validator in
-        # Parser/tokenizer.c used when reading source files.
+        # Parser/tokenizer/helpers.c used when reading source files.
 
         # That file is written using low-level C file I/O, so the only way to
         # test it is to write actual files to disk.
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 06517acb0b2439..41b9ebe3374d62 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1435,7 +1435,7 @@ def test_cookie_second_line_empty_first_line(self):
         self.assertEqual(consumed_lines, expected)
 
     def test_latin1_normalization(self):
-        # See get_normal_name() in tokenizer.c.
+        # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                      "iso-8859-1-unix", "iso-latin-1-mac")
         for encoding in encodings:
@@ -1460,7 +1460,7 @@ def test_syntaxerror_latin1(self):
 
 
     def test_utf8_normalization(self):
-        # See get_normal_name() in tokenizer.c.
+        # See get_normal_name() in Parser/tokenizer/helpers.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
         for encoding in encodings:
             for rep in ("-", "_"):
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index c21876fb403d8f..0ab1893d42f72f 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -298,7 +298,7 @@ def untokenize(iterable):
 
 
 def _get_normal_name(orig_enc):
-    """Imitates get_normal_name in tokenizer.c."""
+    """Imitates get_normal_name in Parser/tokenizer/helpers.c."""
     # Only care about the first 12 characters.
     enc = orig_enc[:12].lower().replace("_", "-")
     if enc == "utf-8" or enc.startswith("utf-8-"):
diff --git a/Modules/config.c.in b/Modules/config.c.in
index 6081f95759538f..53b4fb285498d0 100644
--- a/Modules/config.c.in
+++ b/Modules/config.c.in
@@ -45,7 +45,7 @@ struct _inittab _PyImport_Inittab[] = {
     /* This lives in Python/Python-ast.c */
     {"_ast", PyInit__ast},
 
-    /* This lives in Python/Python-tokenizer.c */
+    /* This lives in Python/Python-tokenize.c */
     {"_tokenize", PyInit__tokenize},
 
     /* These entries are here for sys.builtin_module_names */
diff --git a/Parser/myreadline.c b/Parser/myreadline.c
index 719a178f244a28..1825665354844b 100644
--- a/Parser/myreadline.c
+++ b/Parser/myreadline.c
@@ -1,5 +1,5 @@
 
-/* Readline interface for tokenizer.c and [raw_]input() in bltinmodule.c.
+/* Readline interface for the tokenizer and [raw_]input() in bltinmodule.c.
    By default, or when stdin is not a tty device, we have a super
    simple my_readline function using fgets.
    Optionally, we can use the GNU readline library.
@@ -364,7 +364,7 @@ PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
 
 char *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, const char *) = NULL;
 
-/* Interface used by tokenizer.c and bltinmodule.c */
+/* Interface used by file_tokenizer.c and bltinmodule.c */
 
 char *
 PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
diff --git a/Parser/string_parser.c b/Parser/string_parser.c
index c5f421844e9c52..f1e027765c86b9 100644
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@@ -14,8 +14,9 @@
 static int
 warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
 {
     unsigned char c = *first_invalid_escape;
-    if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) { // in this case the tokenizer has already emitted a warning,
-                                                                                           // see tokenizer.c:warn_invalid_escape_sequence
+    if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {
+        // in this case the tokenizer has already emitted a warning,
+        // see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence
         return 0;
     }
diff --git a/Python/traceback.c b/Python/traceback.c
index 5de1bff9943c6c..f786144eda217c 100644
--- a/Python/traceback.c
+++ b/Python/traceback.c
@@ -32,7 +32,7 @@
 #define MAX_FRAME_DEPTH 100
 #define MAX_NTHREADS 100
 
-/* Function from Parser/tokenizer.c */
+/* Function from Parser/tokenizer/file_tokenizer.c */
 extern char* _PyTokenizer_FindEncodingFilename(int, PyObject *);
 
 /*[clinic input]
diff --git a/Tools/c-analyzer/TODO b/Tools/c-analyzer/TODO
index 27a535814ea52b..3d599538510bd9 100644
--- a/Tools/c-analyzer/TODO
+++ b/Tools/c-analyzer/TODO
@@ -428,8 +428,8 @@ Objects/typeobject.c:type_new():PyId___slots__  _Py_IDENTIFIER(
 Objects/unicodeobject.c:unicodeiter_reduce():PyId_iter             _Py_IDENTIFIER(iter)
 Objects/weakrefobject.c:proxy_bytes():PyId___bytes__               _Py_IDENTIFIER(__bytes__)
 Objects/weakrefobject.c:weakref_repr():PyId___name__               _Py_IDENTIFIER(__name__)
-Parser/tokenizer.c:fp_setreadl():PyId_open                         _Py_IDENTIFIER(open)
-Parser/tokenizer.c:fp_setreadl():PyId_readline                     _Py_IDENTIFIER(readline)
+Parser/tokenizer/file_tokenizer.c:fp_setreadl():PyId_open          _Py_IDENTIFIER(open)
+Parser/tokenizer/file_tokenizer.c:fp_setreadl():PyId_readline      _Py_IDENTIFIER(readline)
 Python/Python-ast.c:ast_type_reduce():PyId___dict__                _Py_IDENTIFIER(__dict__)
 Python/Python-ast.c:make_type():PyId___module__                    _Py_IDENTIFIER(__module__)
 Python/_warnings.c:PyId_stderr                                     _Py_IDENTIFIER(stderr)
diff --git a/Tools/peg_generator/pegen/build.py b/Tools/peg_generator/pegen/build.py
index 6b04ae9ec7025c..30bfb31471c7b2 100644
--- a/Tools/peg_generator/pegen/build.py
+++ b/Tools/peg_generator/pegen/build.py
@@ -123,7 +123,14 @@ def compile_c_extension(
     common_sources = [
         str(MOD_DIR.parent.parent.parent / "Python" / "Python-ast.c"),
         str(MOD_DIR.parent.parent.parent / "Python" / "asdl.c"),
-        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "lexer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "state.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "buffer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "string_tokenizer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "file_tokenizer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "utf8_tokenizer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "readline_tokenizer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "helpers.c"),
         str(MOD_DIR.parent.parent.parent / "Parser" / "pegen.c"),
         str(MOD_DIR.parent.parent.parent / "Parser" / "pegen_errors.c"),
         str(MOD_DIR.parent.parent.parent / "Parser" / "action_helpers.c"),
@@ -133,6 +140,8 @@ def compile_c_extension(
     include_dirs = [
         str(MOD_DIR.parent.parent.parent / "Include" / "internal"),
         str(MOD_DIR.parent.parent.parent / "Parser"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer"),
     ]
     extension = Extension(
         extension_name,
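Note: the build.py change above simply swaps the single Parser/tokenizer.c source for the
refactored lexer/tokenizer file set. For anyone adapting a similar out-of-tree build, a
minimal standalone sketch of that file list follows; the helper function and the sanity
check are illustrative and not part of this commit (paths assume a CPython checkout and
the same MOD_DIR anchor that pegen/build.py uses):

    from pathlib import Path

    # MOD_DIR mirrors the anchor in Tools/peg_generator/pegen/build.py:
    # the directory containing the pegen package. Adjust for your checkout.
    MOD_DIR = Path("Tools/peg_generator/pegen")
    ROOT = MOD_DIR.parent.parent.parent  # the CPython source root

    # The C sources that replace the old Parser/tokenizer.c after the
    # gh-104169 refactor, as listed in compile_c_extension() above.
    LEXER_SOURCES = ["lexer.c", "state.c", "buffer.c"]
    TOKENIZER_SOURCES = [
        "string_tokenizer.c",
        "file_tokenizer.c",
        "utf8_tokenizer.c",
        "readline_tokenizer.c",
        "helpers.c",
    ]

    def tokenizer_source_paths(root: Path) -> list[str]:
        # Hypothetical helper (not in build.py): expand the file names into
        # the full paths that common_sources now contains.
        return ([str(root / "Parser" / "lexer" / name) for name in LEXER_SOURCES]
                + [str(root / "Parser" / "tokenizer" / name) for name in TOKENIZER_SOURCES])

    if __name__ == "__main__":
        # Quick check that a checkout actually has the refactored layout.
        missing = [p for p in tokenizer_source_paths(ROOT) if not Path(p).is_file()]
        print("missing:", missing or "none")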