From 24b9e4dc83d88abf829285cffbcf6f964b079a8d Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 8 May 2025 18:54:14 +0200 Subject: [PATCH] Revert multiprocess html to text and add test for high concurrency --- changedetectionio/html_tools.py | 62 +++++-------------- changedetectionio/run_basic_tests.sh | 3 + .../tests/test_history_consistency.py | 4 +- 3 files changed, 19 insertions(+), 50 deletions(-) diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py index 668f5ca0..69807f35 100644 --- a/changedetectionio/html_tools.py +++ b/changedetectionio/html_tools.py @@ -437,60 +437,26 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False # NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON -import os -def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False): + +def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str: from inscriptis import get_text from inscriptis.model.config import ParserConfig - try: - if render_anchor_tag_content: - parser_config = ParserConfig( - annotation_rules={"a": ["hyperlink"]}, - display_links=True - ) - else: - parser_config = None - if is_rss: - html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content) - html_content = re.sub(r'</title>', r'</h1>', html_content) + if render_anchor_tag_content: + parser_config = ParserConfig( + annotation_rules={"a": ["hyperlink"]}, + display_links=True + ) + else: + parser_config = None - text_content = get_text(html_content, config=parser_config) + if is_rss: + html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content) + html_content = re.sub(r'</title>', r'</h1>', html_content) - with open(temp_file_path, "w", encoding="utf-8") as f: - f.write(text_content) - - except Exception as e: - # Write error to file so the parent can read it - with open(temp_file_path, "w", encoding="utf-8") as f: - f.write(f"[ERROR] {e}") - -import tempfile -from multiprocessing import Process -def html_to_text(html_content: str, 
render_anchor_tag_content=False, is_rss=False, timeout=10) -> str: - - - with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file: - temp_file_path = tmp_file.name - - p = Process( - target=html_to_text_sub_worker, - args=(temp_file_path, html_content, render_anchor_tag_content, is_rss) - ) - p.start() - p.join(timeout) - - if p.is_alive(): - p.terminate() - p.join() - - try: - with open(temp_file_path, "r", encoding="utf-8") as f: - result = f.read() - finally: - os.remove(temp_file_path) - - return result + text_content = get_text(html_content, config=parser_config) + return text_content # Does LD+JSON exist with a @type=='product' and a .price set anywhere? def has_ldjson_product_info(content): diff --git a/changedetectionio/run_basic_tests.sh b/changedetectionio/run_basic_tests.sh index 7e13b766..2f179fda 100755 --- a/changedetectionio/run_basic_tests.sh +++ b/changedetectionio/run_basic_tests.sh @@ -38,6 +38,9 @@ pytest tests/test_backend.py pytest tests/test_rss.py pytest tests/test_unique_lines.py +# Try high concurrency +FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l + # Check file:// will pickup a file when enabled echo "Hello world" > /tmp/test-file.txt ALLOW_FILE_URI=yes pytest tests/test_security.py diff --git a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py index 113c54c0..1558c275 100644 --- a/changedetectionio/tests/test_history_consistency.py +++ b/changedetectionio/tests/test_history_consistency.py @@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs def test_consistent_history(client, live_server, measure_memory_usage): live_server_setup(live_server) - - r = range(1, 30) + workers = int(os.getenv("FETCH_WORKERS", 10)) + r = range(1, 10+workers) for one in r: test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)