pythongh-104169: Refactor tokenizer into lexer and wrappers (python#110684)

* The lexer, which includes the actual lexeme-producing logic, goes into
  the `lexer` directory.
* The wrappers, one wrapper per input mode (file, string, utf-8, and
  readline), go into the `tokenizer` directory and include logic for
  creating a lexer instance and managing the buffer for different modes.
---------

Co-authored-by: Pablo Galindo <[email protected]>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
3 people authored and Glyphack committed Jan 27, 2024
1 parent ee0b11b commit d9012fa
Showing 29 changed files with 3,185 additions and 2,988 deletions.
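
The split is easiest to see in miniature. Below is a self-contained toy sketch (hypothetical names and types, not CPython's internal API) of the lexer half described in the commit message: it owns a scanning position into a character buffer and produces one lexeme at a time, with no knowledge of where the characters came from.

/* Toy lexer core: produce lexemes from an in-memory buffer.
   Illustrative only; not the CPython lexer. */
#include <ctype.h>
#include <stdio.h>
#include <string.h>

typedef enum { TOK_NAME, TOK_NUMBER, TOK_OP, TOK_END } toktype;

typedef struct {
    const char *cur;   /* next character to scan */
    const char *start; /* start of the current lexeme */
} lexer_state;

static toktype
lexer_next(lexer_state *lx, char *out, size_t outsize)
{
    while (*lx->cur == ' ') {
        lx->cur++;                 /* skip whitespace between lexemes */
    }
    lx->start = lx->cur;
    if (*lx->cur == '\0') {
        return TOK_END;
    }
    if (isalpha((unsigned char)*lx->cur)) {
        while (isalnum((unsigned char)*lx->cur)) lx->cur++;
    }
    else if (isdigit((unsigned char)*lx->cur)) {
        while (isdigit((unsigned char)*lx->cur)) lx->cur++;
    }
    else {
        lx->cur++;                 /* single-character operator */
    }
    size_t n = (size_t)(lx->cur - lx->start);
    if (n >= outsize) n = outsize - 1;
    memcpy(out, lx->start, n);     /* copy the lexeme text out */
    out[n] = '\0';
    if (isalpha((unsigned char)*lx->start)) return TOK_NAME;
    if (isdigit((unsigned char)*lx->start)) return TOK_NUMBER;
    return TOK_OP;
}

int main(void)
{
    lexer_state lx = { "x = 42", NULL };
    char lexeme[32];
    while (lexer_next(&lx, lexeme, sizeof lexeme) != TOK_END) {
        printf("lexeme: %s\n", lexeme);   /* prints: x, =, 42 */
    }
    return 0;
}

The wrappers sit on the other side of this boundary: they decide how the buffer gets filled (from a string, a UTF-8 buffer, a file, or a readline callable) before the lexer ever looks at it.
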
22 changes: 20 additions & 2 deletions Makefile.pre.in
@@ -347,20 +347,36 @@ PEGEN_OBJS= \
Parser/string_parser.o \
Parser/peg_api.o

+TOKENIZER_OBJS= \
+Parser/lexer/buffer.o \
+Parser/lexer/lexer.o \
+Parser/lexer/state.o \
+Parser/tokenizer/file_tokenizer.o \
+Parser/tokenizer/readline_tokenizer.o \
+Parser/tokenizer/string_tokenizer.o \
+Parser/tokenizer/utf8_tokenizer.o \
+Parser/tokenizer/helpers.o

PEGEN_HEADERS= \
$(srcdir)/Include/internal/pycore_parser.h \
$(srcdir)/Parser/pegen.h \
$(srcdir)/Parser/string_parser.h

+TOKENIZER_HEADERS= \
+Parser/lexer/buffer.h \
+Parser/lexer/lexer.h \
+Parser/lexer/state.h \
+Parser/tokenizer/tokenizer.h \
+Parser/tokenizer/helpers.h

POBJS= \
Parser/token.o \

-PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) Parser/myreadline.o Parser/tokenizer.o
+PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) $(TOKENIZER_OBJS) Parser/myreadline.o

PARSER_HEADERS= \
$(PEGEN_HEADERS) \
-$(srcdir)/Parser/tokenizer.h
+$(TOKENIZER_HEADERS)

##########################################################################
# Python
@@ -1397,6 +1413,8 @@ regen-pegen-metaparser:
.PHONY: regen-pegen
regen-pegen:
@$(MKDIR_P) $(srcdir)/Parser
+@$(MKDIR_P) $(srcdir)/Parser/tokenizer
+@$(MKDIR_P) $(srcdir)/Parser/lexer
PYTHONPATH=$(srcdir)/Tools/peg_generator $(PYTHON_FOR_REGEN) -m pegen -q c \
$(srcdir)/Grammar/python.gram \
$(srcdir)/Grammar/Tokens \
4 changes: 4 additions & 0 deletions Misc/NEWS.d/…
@@ -0,0 +1,4 @@
Split the tokenizer into two separate directories:
- One part includes the actual lexeme-producing logic and lives in ``Parser/lexer``.
- The second part wraps the lexer according to the different tokenization modes
we have (string, utf-8, file, interactive, readline) and lives in ``Parser/tokenizer``.
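
What wrapping the lexer "according to the different tokenization modes" means in practice: each mode contributes its own buffer-refill routine, and the per-mode constructor does little more than select it. The toy sketch below uses illustrative names only; the real wrappers are the Parser/tokenizer/*.c files in this commit, and the real struct tok_state dispatches through a similar per-mode underflow callback, with different signatures.

/* Toy mode wrappers: string mode vs. file mode. Hypothetical code,
   not CPython's entry points. */
#include <stdio.h>
#include <string.h>

#define BUFSIZE 256

struct toy_tok_state;
typedef int (*underflow_fn)(struct toy_tok_state *);

typedef struct toy_tok_state {
    char buf[BUFSIZE];
    underflow_fn underflow;  /* mode-specific buffer refill */
    const char *str;         /* string mode: remaining input */
    FILE *fp;                /* file mode: stream to read from */
} toy_tok_state;

/* String mode: the whole input is already in memory; hand out one line. */
static int
string_underflow(toy_tok_state *tok)
{
    if (*tok->str == '\0') return 0;
    const char *nl = strchr(tok->str, '\n');
    size_t n = nl ? (size_t)(nl - tok->str) + 1 : strlen(tok->str);
    if (n >= BUFSIZE) n = BUFSIZE - 1;
    memcpy(tok->buf, tok->str, n);
    tok->buf[n] = '\0';
    tok->str += n;
    return 1;
}

/* File mode: pull the next line from the stream on demand. */
static int
file_underflow(toy_tok_state *tok)
{
    return fgets(tok->buf, BUFSIZE, tok->fp) != NULL;
}

/* Per-mode constructors: all they do is pick the refill routine. */
static void
toy_tokenizer_from_string(toy_tok_state *tok, const char *s)
{
    memset(tok, 0, sizeof *tok);
    tok->str = s;
    tok->underflow = string_underflow;
}

static void
toy_tokenizer_from_file(toy_tok_state *tok, FILE *fp)
{
    memset(tok, 0, sizeof *tok);
    tok->fp = fp;
    tok->underflow = file_underflow;
}

int main(void)
{
    toy_tok_state tok;
    toy_tokenizer_from_string(&tok, "a = 1\nb = 2\n");
    while (tok.underflow(&tok)) {       /* the lexer would scan tok.buf here */
        printf("line: %s", tok.buf);
    }
    (void)toy_tokenizer_from_file;      /* file mode is wired the same way */
    return 0;
}
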
9 changes: 8 additions & 1 deletion PCbuild/_freeze_module.vcxproj
@@ -172,7 +172,14 @@
<ClCompile Include="..\Parser\action_helpers.c" />
<ClCompile Include="..\Parser\string_parser.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\Parser\tokenizer.c" />
<ClCompile Include="..\Parser\lexer\buffer.c" />
<ClCompile Include="..\Parser\lexer\state.c" />
<ClCompile Include="..\Parser\lexer\lexer.c" />
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\helpers.c" />
<ClCompile Include="..\PC\invalid_parameter_handler.c" />
<ClCompile Include="..\PC\msvcrtmodule.c" />
<ClCompile Include="..\PC\winreg.c" />
23 changes: 22 additions & 1 deletion PCbuild/_freeze_module.vcxproj.filters
@@ -397,7 +397,28 @@
<ClCompile Include="..\Parser\token.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer.c">
<ClCompile Include="..\Parser\lexer\lexer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\buffer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\state.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\helpers.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\Python\traceback.c">
15 changes: 13 additions & 2 deletions PCbuild/pythoncore.vcxproj
@@ -362,7 +362,11 @@
<ClInclude Include="..\Objects\stringlib\replace.h" />
<ClInclude Include="..\Objects\stringlib\split.h" />
<ClInclude Include="..\Objects\unicodetype_db.h" />
<ClInclude Include="..\Parser\tokenizer.h" />
<ClInclude Include="..\Parser\lexer\state.h" />
<ClInclude Include="..\Parser\lexer\lexer.h" />
<ClInclude Include="..\Parser\lexer\buffer.h" />
<ClInclude Include="..\Parser\tokenizer\helpers.h" />
<ClInclude Include="..\Parser\tokenizer\tokenizer.h" />
<ClInclude Include="..\Parser\string_parser.h" />
<ClInclude Include="..\Parser\pegen.h" />
<ClInclude Include="..\PC\errmap.h" />
@@ -507,7 +511,14 @@
<ClCompile Include="..\Objects\unionobject.c" />
<ClCompile Include="..\Objects\weakrefobject.c" />
<ClCompile Include="..\Parser\myreadline.c" />
<ClCompile Include="..\Parser\tokenizer.c" />
<ClCompile Include="..\Parser\lexer\state.c" />
<ClCompile Include="..\Parser\lexer\lexer.c" />
<ClCompile Include="..\Parser\lexer\buffer.c" />
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c" />
<ClCompile Include="..\Parser\tokenizer\helpers.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\Parser\pegen.c" />
<ClCompile Include="..\Parser\pegen_errors.c" />
37 changes: 35 additions & 2 deletions PCbuild/pythoncore.vcxproj.filters
@@ -291,7 +291,19 @@
<ClInclude Include="..\Objects\unicodetype_db.h">
<Filter>Objects</Filter>
</ClInclude>
<ClInclude Include="..\Parser\tokenizer.h">
<ClInclude Include="..\Parser\lexer\lexer.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\lexer\state.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\lexer\buffer.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\tokenizer\tokenizer.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\Parser\tokenizer\helpers.h">
<Filter>Parser</Filter>
</ClInclude>
<ClInclude Include="..\PC\errmap.h">
@@ -1139,7 +1151,28 @@
<ClCompile Include="..\Parser\myreadline.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer.c">
<ClCompile Include="..\Parser\lexer\lexer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\state.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\lexer\buffer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\string_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\file_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\utf8_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\readline_tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\tokenizer\helpers.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\token.c">
1 change: 0 additions & 1 deletion Parser/action_helpers.c
@@ -1,7 +1,6 @@
#include <Python.h>

#include "pegen.h"
#include "tokenizer.h"
#include "string_parser.h"
#include "pycore_runtime.h" // _PyRuntime

76 changes: 76 additions & 0 deletions Parser/lexer/buffer.c
@@ -0,0 +1,76 @@
#include "Python.h"
#include "errcode.h"

#include "state.h"

/* Traverse and remember all f-string buffers, in order to be able to restore
them after reallocating tok->buf */
void
_PyLexer_remember_fstring_buffers(struct tok_state *tok)
{
int index;
tokenizer_mode *mode;

for (index = tok->tok_mode_stack_index; index >= 0; --index) {
mode = &(tok->tok_mode_stack[index]);
mode->f_string_start_offset = mode->f_string_start - tok->buf;
mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf;
}
}

/* Traverse and restore all f-string buffers after reallocating tok->buf */
void
_PyLexer_restore_fstring_buffers(struct tok_state *tok)
{
int index;
tokenizer_mode *mode;

for (index = tok->tok_mode_stack_index; index >= 0; --index) {
mode = &(tok->tok_mode_stack[index]);
mode->f_string_start = tok->buf + mode->f_string_start_offset;
mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset;
}
}

/* Read a line of text from TOK into S, using the stream in TOK.
Return NULL on failure, else S.
On entry, tok->decoding_buffer will be one of:
1) NULL: need to call tok->decoding_readline to get a new line
2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
stored the result in tok->decoding_buffer
3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
(in the s buffer) to copy entire contents of the line read
by tok->decoding_readline. tok->decoding_buffer has the overflow.
In this case, tok_readline_recode is called in a loop (with an expanded buffer)
until the buffer ends with a '\n' (or until the end of the file is
reached): see tok_nextc and its calls to tok_reserve_buf.
*/
int
_PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
Py_ssize_t cur = tok->cur - tok->buf;
Py_ssize_t oldsize = tok->inp - tok->buf;
Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
if (newsize > tok->end - tok->buf) {
char *newbuf = tok->buf;
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
_PyLexer_remember_fstring_buffers(tok);
newbuf = (char *)PyMem_Realloc(newbuf, newsize);
if (newbuf == NULL) {
tok->done = E_NOMEM;
return 0;
}
tok->buf = newbuf;
tok->cur = tok->buf + cur;
tok->inp = tok->buf + oldsize;
tok->end = tok->buf + newsize;
tok->start = start < 0 ? NULL : tok->buf + start;
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
_PyLexer_restore_fstring_buffers(tok);
}
return 1;
}
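
_PyLexer_tok_reserve_buf above, together with the remember/restore helpers, is an instance of a general idiom: realloc may move the block, so every pointer into the buffer is first converted to an offset, and after the call each pointer is rebuilt against the new base. A stripped-down, self-contained version of that idiom follows (illustrative names and a similar growth policy, not the CPython code).

/* Toy pointer-rebasing around realloc. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    char *buf, *cur, *inp, *end;  /* roles like tok->buf/cur/inp/end */
} toy_buffer;

static int
toy_reserve(toy_buffer *b, size_t extra)
{
    size_t cur = (size_t)(b->cur - b->buf);    /* remember offsets */
    size_t inp = (size_t)(b->inp - b->buf);
    size_t oldsize = (size_t)(b->end - b->buf);
    size_t newsize = oldsize + (extra > oldsize / 2 ? extra : oldsize / 2);

    char *newbuf = realloc(b->buf, newsize);   /* may move the block */
    if (newbuf == NULL) {
        return 0;                              /* caller reports out-of-memory */
    }
    b->buf = newbuf;                           /* rebuild the pointers */
    b->cur = newbuf + cur;
    b->inp = newbuf + inp;
    b->end = newbuf + newsize;
    return 1;
}

int main(void)
{
    toy_buffer b;
    b.buf = malloc(8);
    if (b.buf == NULL) return 1;
    memcpy(b.buf, "abc", 3);
    b.cur = b.buf + 1;       /* scanning position must survive the realloc */
    b.inp = b.buf + 3;
    b.end = b.buf + 8;
    if (toy_reserve(&b, 1024)) {
        printf("cur still points at '%c'\n", *b.cur);  /* prints 'b' */
    }
    free(b.buf);
    return 0;
}

The f-string helpers exist because tok_mode_stack holds extra pointers (f_string_start, f_string_multi_line_start) into the same buffer, so they get the same offset-then-rebase treatment around the realloc.
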
10 changes: 10 additions & 0 deletions Parser/lexer/buffer.h
@@ -0,0 +1,10 @@
#ifndef _LEXER_BUFFER_H_
#define _LEXER_BUFFER_H_

#include "pyport.h"

void _PyLexer_remember_fstring_buffers(struct tok_state *tok);
void _PyLexer_restore_fstring_buffers(struct tok_state *tok);
int _PyLexer_tok_reserve_buf(struct tok_state *tok, Py_ssize_t size);

#endif