From 00b76aaaee6e96f541211b974962c041208361b4 Mon Sep 17 00:00:00 2001 From: eric Date: Thu, 5 Sep 2024 09:45:58 +0200 Subject: [PATCH] fix issue-109 (use unique temp files for input/output of ExtractArticle.js), increase verbosity of tests (tests for above don't pass due to some extra \n) --- readabilipy/simple_json.py | 16 ++++++++++------ tests/checks.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/readabilipy/simple_json.py b/readabilipy/simple_json.py index 83e83e9..b462097 100644 --- a/readabilipy/simple_json.py +++ b/readabilipy/simple_json.py @@ -44,13 +44,17 @@ def simple_json_from_html_string(html, content_digests=False, node_indexes=False if use_readability: # Write input HTML to temporary file so it is available to the node.js script - with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as f_html: + # It is important that this file be unique in case this function is called concurrently + with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8", prefix="readabilipy") as f_html: f_html.write(html) f_html.close() html_path = f_html.name + # Derive some output name + # (making the assumption this will be unique too) + json_path = html_path + ".json" + # Call Mozilla's Readability.js Readability.parse() function via node, writing output to a temporary file - article_json_path = f_html.name + ".json" jsdir = os.path.join(os.path.dirname(__file__), 'javascript') try: result = subprocess.run( @@ -64,12 +68,12 @@ def simple_json_from_html_string(html, content_digests=False, node_indexes=False print(e.stderr) raise - # Read output of call to Readability.parse() from JSON file and return as Python dictionary - with open(article_json_path, "r", encoding="utf-8") as json_file: + # Read output of call to Readability.parse() from JSON file as Python dictionary + with open(json_path, "r", encoding="utf-8") as json_file: input_json = json.load(json_file) - # Deleting files after processing - 
os.unlink(article_json_path) + # Delete temporary input and output files after processing + os.unlink(json_path) os.unlink(f_html.name) else: input_json = { diff --git a/tests/checks.py b/tests/checks.py index aec404d..a70bd7d 100644 --- a/tests/checks.py +++ b/tests/checks.py @@ -51,7 +51,7 @@ def check_extract_article(test_filename, expected_filename, content_digests=Fals expected_article_json = json.loads(h.read()) # Test full JSON matches (checks for unexpected fields in either actual or expected JSON) - assert article_json == expected_article_json + assert article_json == expected_article_json, f"{article_json=} != {expected_article_json=}" def check_extract_paragraphs_as_plain_text(test_filename, expected_filename):