pythongh-104169: Refactor tokenizer into lexer and wrappers (python#110684)

* The lexer, which includes the actual lexeme-producing logic, goes into
  the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and
  readline), go into the `tokenizer` directory and include logic for
  creating a lexer instance and managing the buffer for different modes.
---------

Co-authored-by: Pablo Galindo <[email protected]>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
3 people authored and Glyphack committed Jan 27, 2024
1 parent ee0b11b commit d9012fa
Showing 29 changed files with 3,185 additions and 2,988 deletions.
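
The split is easiest to see in miniature. Below is a self-contained toy sketch (hypothetical names and types, not CPython's internal API) of the lexer half described in the commit message: it owns a scanning position into a character buffer and produces one lexeme at a time, with no knowledge of where the characters came from.

/* Toy lexer core: produce lexemes from an in-memory buffer.
   Illustrative only; not the CPython lexer. */
#include <ctype.h>
#include <stdio.h>
#include <string.h>

typedef enum { TOK_NAME, TOK_NUMBER, TOK_OP, TOK_END } toktype;

typedef struct {
    const char *cur;   /* next character to scan */
    const char *start; /* start of the current lexeme */
} lexer_state;

static toktype
lexer_next(lexer_state *lx, char *out, size_t outsize)
{
    while (*lx->cur == ' ') {
        lx->cur++;                 /* skip whitespace between lexemes */
    }
    lx->start = lx->cur;
    if (*lx->cur == '\0') {
        return TOK_END;
    }
    if (isalpha((unsigned char)*lx->cur)) {
        while (isalnum((unsigned char)*lx->cur)) lx->cur++;
    }
    else if (isdigit((unsigned char)*lx->cur)) {
        while (isdigit((unsigned char)*lx->cur)) lx->cur++;
    }
    else {
        lx->cur++;                 /* single-character operator */
    }
    size_t n = (size_t)(lx->cur - lx->start);
    if (n >= outsize) n = outsize - 1;
    memcpy(out, lx->start, n);     /* copy the lexeme text out */
    out[n] = '\0';
    if (isalpha((unsigned char)*lx->start)) return TOK_NAME;
    if (isdigit((unsigned char)*lx->start)) return TOK_NUMBER;
    return TOK_OP;
}

int main(void)
{
    lexer_state lx = { "x = 42", NULL };
    char lexeme[32];
    while (lexer_next(&lx, lexeme, sizeof lexeme) != TOK_END) {
        printf("lexeme: %s\n", lexeme);   /* prints: x, =, 42 */
    }
    return 0;
}

The wrappers sit on the other side of this boundary: they decide how the buffer gets filled (from a string, a UTF-8 buffer, a file, or a readline callable) before the lexer ever looks at it.
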
22 changes: 20 additions & 2 deletions Makefile.pre.in
@@ -347,20 +347,36 @@ PEGEN_OBJS= \
Parser/string_parser.o \
Parser/peg_api.o

+TOKENIZER_OBJS= \
+Parser/lexer/buffer.o \
+Parser/lexer/lexer.o \
+Parser/lexer/state.o \
+Parser/tokenizer/file_tokenizer.o \
+Parser/tokenizer/readline_tokenizer.o \
+Parser/tokenizer/string_tokenizer.o \
+Parser/tokenizer/utf8_tokenizer.o \
+Parser/tokenizer/helpers.o

PEGEN_HEADERS= \
$(srcdir)/Include/internal/pycore_parser.h \
$(srcdir)/Parser/pegen.h \
$(srcdir)/Parser/string_parser.h

+TOKENIZER_HEADERS= \
+Parser/lexer/buffer.h \
+Parser/lexer/lexer.h \
+Parser/lexer/state.h \
+Parser/tokenizer/tokenizer.h \
+Parser/tokenizer/helpers.h

POBJS= \
Parser/token.o \

-PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) Parser/myreadline.o Parser/tokenizer.o
+PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) $(TOKENIZER_OBJS) Parser/myreadline.o

PARSER_HEADERS= \
$(PEGEN_HEADERS) \
-$(srcdir)/Parser/tokenizer.h
+$(TOKENIZER_HEADERS)

##########################################################################
# Python
@@ -1397,6 +1413,8 @@ regen-pegen-metaparser:
.PHONY: regen-pegen
regen-pegen:
@$(MKDIR_P) $(srcdir)/Parser
+@$(MKDIR_P) $(srcdir)/Parser/tokenizer
+@$(MKDIR_P) $(srcdir)/Parser/lexer
PYTHONPATH=$(srcdir)/Tools/peg_generator $(PYTHON_FOR_REGEN) -m pegen -q c \
$(srcdir)/Grammar/python.gram \
$(srcdir)/Grammar/Tokens \
4 changes: 4 additions & 0 deletions Misc/NEWS.d/…
@@ -0,0 +1,4 @@
Split the tokenizer into two separate directories:
- One part includes the actual lexeme-producing logic and lives in ``Parser/lexer``.
- The second part wraps the lexer according to the different tokenization modes
we have (string, utf-8, file, interactive, readline) and lives in ``Parser/tokenizer``.
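
What wrapping the lexer "according to the different tokenization modes" means in practice: each mode contributes its own buffer-refill routine, and the per-mode constructor does little more than select it. The toy sketch below uses illustrative names only; the real wrappers are the Parser/tokenizer/*.c files in this commit, and the real struct tok_state dispatches through a similar per-mode underflow callback, with different signatures.

/* Toy mode wrappers: string mode vs. file mode. Hypothetical code,
   not CPython's entry points. */
#include <stdio.h>
#include <string.h>

#define BUFSIZE 256

struct toy_tok_state;
typedef int (*underflow_fn)(struct toy_tok_state *);

typedef struct toy_tok_state {
    char buf[BUFSIZE];
    underflow_fn underflow;  /* mode-specific buffer refill */
    const char *str;         /* string mode: remaining input */
    FILE *fp;                /* file mode: stream to read from */
} toy_tok_state;

/* String mode: the whole input is already in memory; hand out one line. */
static int
string_underflow(toy_tok_state *tok)
{
    if (*tok->str == '\0') return 0;
    const char *nl = strchr(tok->str, '\n');
    size_t n = nl ? (size_t)(nl - tok->str) + 1 : strlen(tok->str);
    if (n >= BUFSIZE) n = BUFSIZE - 1;
    memcpy(tok->buf, tok->str, n);
    tok->buf[n] = '\0';
    tok->str += n;
    return 1;
}

/* File mode: pull the next line from the stream on demand. */
static int
file_underflow(toy_tok_state *tok)
{
    return fgets(tok->buf, BUFSIZE, tok->fp) != NULL;
}

/* Per-mode constructors: all they do is pick the refill routine. */
static void
toy_tokenizer_from_string(toy_tok_state *tok, const char *s)
{
    memset(tok, 0, sizeof *tok);
    tok->str = s;
    tok->underflow = string_underflow;
}

static void
toy_tokenizer_from_file(toy_tok_state *tok, FILE *fp)
{
    memset(tok, 0, sizeof *tok);
    tok->fp = fp;
    tok->underflow = file_underflow;
}

int main(void)
{
    toy_tok_state tok;
    toy_tokenizer_from_string(&tok, "a = 1\nb = 2\n");
    while (tok.underflow(&tok)) {       /* the lexer would scan tok.buf here */
        printf("line: %s", tok.buf);
    }
    (void)toy_tokenizer_from_file;      /* file mode is wired the same way */
    return 0;
}
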
9 changes: 8 additions & 1 deletion PCbuild/_freeze_module.vcxproj
@@ -172,7 +172,14 @@
<ClCompile Include="..\Parser\action_helpers.c" />
<ClCompile Include="..\Parser\string_parser.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\Parser\tokenizer.c" />
<ClCompile Include="..\Parser\lexer\buffer.c" />
<ClCompile Include="..\Parser\lexer\state.c" />
<ClCompile Include="..\Parser\lexer\lexer.c" />
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\helpers.c" />
<ClCompile Include="..\PC\invalid_parameter_handler.c" />
<ClCompile Include="..\PC\msvcrtmodule.c" />
<ClCompile Include="..\PC\winreg.c" />
23 changes: 22 additions & 1 deletion PCbuild/_freeze_module.vcxproj.filters
@@ -397,7 +397,28 @@
<ClCompile Include="..\Parser\token.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer.c">
<ClCompile Include="..\Parser\lexer\lexer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\buffer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\state.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\helpers.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Python\traceback.c">
15 changes: 13 additions & 2 deletions PCbuild/pythoncore.vcxproj
@@ -362,7 +362,11 @@
<ClInclude Include="..\Objects\stringlib\replace.h" />
<ClInclude Include="..\Objects\stringlib\split.h" />
<ClInclude Include="..\Objects\unicodetype_db.h" />
<ClInclude Include="..\Parser\tokenizer.h" />
<ClInclude Include="..\Parser\lexer\state.h" />
<ClInclude Include="..\Parser\lexer\lexer.h" />
<ClInclude Include="..\Parser\lexer\buffer.h" />
<ClInclude Include="..\Parser\tokenizer\helpers.h" />
<ClInclude Include="..\Parser\tokenizer\tokenizer.h" />
<ClInclude Include="..\Parser\string_parser.h" />
<ClInclude Include="..\Parser\pegen.h" />
<ClInclude Include="..\PC\errmap.h" />
@@ -507,7 +511,14 @@
<ClCompile Include="..\Objects\unionobject.c" />
<ClCompile Include="..\Objects\weakrefobject.c" />
<ClCompile Include="..\Parser\myreadline.c" />
<ClCompile Include="..\Parser\tokenizer.c" />
<ClCompile Include="..\Parser\lexer\state.c" />
<ClCompile Include="..\Parser\lexer\lexer.c" />
<ClCompile Include="..\Parser\lexer\buffer.c" />
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\helpers.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\Parser\pegen.c" />
<ClCompile Include="..\Parser\pegen_errors.c" />
37 changes: 35 additions & 2 deletions PCbuild/pythoncore.vcxproj.filters
@@ -291,7 +291,19 @@
<ClInclude Include="..\Objects\unicodetype_db.h">
<Filter>Objects</Filter>
</ClInclude>
<ClInclude Include="..\Parser\tokenizer.h">
<ClInclude Include="..\Parser\lexer\lexer.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\lexer\state.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\lexer\buffer.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\tokenizer\tokenizer.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\tokenizer\helpers.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\PC\errmap.h">
@@ -1139,7 +1151,28 @@
<ClCompile Include="..\Parser\myreadline.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer.c">
<ClCompile Include="..\Parser\lexer\lexer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\state.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\buffer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\helpers.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\token.c">
1 change: 0 additions & 1 deletion Parser/action_helpers.c
@@ -1,7 +1,6 @@
#include <Python.h>

#include "pegen.h"
#include "tokenizer.h"
#include "string_parser.h"
#include "pycore_runtime.h" // _PyRuntime

76 changes: 76 additions & 0 deletions Parser/lexer/buffer.c
@@ -0,0 +1,76 @@
#include "Python.h"
#include "errcode.h"

#include "state.h"

/* Traverse and remember all f-string buffers, in order to be able to restore
them after reallocating tok->buf */
void
_PyLexer_remember_fstring_buffers(struct tok_state *tok)
{
int index;
tokenizer_mode *mode;

for (index = tok->tok_mode_stack_index; index >= 0; --index) {
mode = &(tok->tok_mode_stack[index]);
mode->f_string_start_offset = mode->f_string_start - tok->buf;
mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf;
}
}

/* Traverse and restore all f-string buffers after reallocating tok->buf */
void
_PyLexer_restore_fstring_buffers(struct tok_state *tok)
{
int index;
tokenizer_mode *mode;

for (index = tok->tok_mode_stack_index; index >= 0; --index) {
mode = &(tok->tok_mode_stack[index]);
mode->f_string_start = tok->buf + mode->f_string_start_offset;
mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset;
}
}

/* Read a line of text from TOK into S, using the stream in TOK.
Return NULL on failure, else S.
On entry, tok->decoding_buffer will be one of:
1) NULL: need to call tok->decoding_readline to get a new line
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
stored the result in tok->decoding_buffer
3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
(in the s buffer) to copy entire contents of the line read
by tok->decoding_readline. tok->decoding_buffer has the overflow.
In this case, tok_readline_recode is called in a loop (with an expanded buffer)
until the buffer ends with a '\n' (or until the end of the file is
reached): see tok_nextc and its calls to tok_reserve_buf.
*/
int
_PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
Py_ssize_t cur = tok->cur - tok->buf;
Py_ssize_t oldsize = tok->inp - tok->buf;
Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
if (newsize > tok->end - tok->buf) {
char *newbuf = tok->buf;
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
_PyLexer_remember_fstring_buffers(tok);
newbuf = (char *)PyMem_Realloc(newbuf, newsize);
if (newbuf == NULL) {
tok->done = E_NOMEM;
return 0;
}
tok->buf = newbuf;
tok->cur = tok->buf + cur;
tok->inp = tok->buf + oldsize;
tok->end = tok->buf + newsize;
tok->start = start < 0 ? NULL : tok->buf + start;
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
_PyLexer_restore_fstring_buffers(tok);
}
return 1;
}
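
_PyLexer_tok_reserve_buf above, together with the remember/restore helpers, is an instance of a general idiom: realloc may move the block, so every pointer into the buffer is first converted to an offset, and after the call each pointer is rebuilt against the new base. A stripped-down, self-contained version of that idiom follows (illustrative names and a similar growth policy, not the CPython code).

/* Toy pointer-rebasing around realloc. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    char *buf, *cur, *inp, *end;  /* roles like tok->buf/cur/inp/end */
} toy_buffer;

static int
toy_reserve(toy_buffer *b, size_t extra)
{
    size_t cur = (size_t)(b->cur - b->buf);    /* remember offsets */
    size_t inp = (size_t)(b->inp - b->buf);
    size_t oldsize = (size_t)(b->end - b->buf);
    size_t newsize = oldsize + (extra > oldsize / 2 ? extra : oldsize / 2);

    char *newbuf = realloc(b->buf, newsize);   /* may move the block */
    if (newbuf == NULL) {
        return 0;                              /* caller reports out-of-memory */
    }
    b->buf = newbuf;                           /* rebuild the pointers */
    b->cur = newbuf + cur;
    b->inp = newbuf + inp;
    b->end = newbuf + newsize;
    return 1;
}

int main(void)
{
    toy_buffer b;
    b.buf = malloc(8);
    if (b.buf == NULL) return 1;
    memcpy(b.buf, "abc", 3);
    b.cur = b.buf + 1;       /* scanning position must survive the realloc */
    b.inp = b.buf + 3;
    b.end = b.buf + 8;
    if (toy_reserve(&b, 1024)) {
        printf("cur still points at '%c'\n", *b.cur);  /* prints 'b' */
    }
    free(b.buf);
    return 0;
}

The f-string helpers exist because tok_mode_stack holds extra pointers (f_string_start, f_string_multi_line_start) into the same buffer, so they get the same offset-then-rebase treatment around the realloc.
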
10 changes: 10 additions & 0 deletions Parser/lexer/buffer.h
@@ -0,0 +1,10 @@
#ifndef _LEXER_BUFFER_H_
#define _LEXER_BUFFER_H_

#include "pyport.h"

void _PyLexer_remember_fstring_buffers(struct tok_state *tok);
void _PyLexer_restore_fstring_buffers(struct tok_state *tok);
int _PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif