Merge branch 'revert-multiprocess-htmlt-to-text' into realtime-ui

2025-05-08 18:55:04 +02:00
parent 23ac6c37a1 24b9e4dc83
commit 963cbcc61e
3 changed files with 19 additions and 50 deletions
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -437,60 +437,26 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False


 # NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-import os

-def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
+
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
    from inscriptis import get_text
    from inscriptis.model.config import ParserConfig
-    try:
-        if render_anchor_tag_content:
-            parser_config = ParserConfig(
-                annotation_rules={"a": ["hyperlink"]},
-                display_links=True
-            )
-        else:
-            parser_config = None

-        if is_rss:
-            html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
-            html_content = re.sub(r'</title>', r'</h1>', html_content)
+    if render_anchor_tag_content:
+        parser_config = ParserConfig(
+            annotation_rules={"a": ["hyperlink"]},
+            display_links=True
+        )
+    else:
+        parser_config = None

-        text_content = get_text(html_content, config=parser_config)
+    if is_rss:
+        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
+        html_content = re.sub(r'</title>', r'</h1>', html_content)

-        with open(temp_file_path, "w", encoding="utf-8") as f:
-            f.write(text_content)
-
-    except Exception as e:
-        # Write error to file so the parent can read it
-        with open(temp_file_path, "w", encoding="utf-8") as f:
-            f.write(f"[ERROR] {e}")
-
-import tempfile
-from multiprocessing import Process
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
-
-
-    with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
-        temp_file_path = tmp_file.name
-
-    p = Process(
-        target=html_to_text_sub_worker,
-        args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
-    )
-    p.start()
-    p.join(timeout)
-
-    if p.is_alive():
-        p.terminate()
-        p.join()
-
-    try:
-        with open(temp_file_path, "r", encoding="utf-8") as f:
-            result = f.read()
-    finally:
-        os.remove(temp_file_path)
-
-    return result
+    text_content = get_text(html_content, config=parser_config)
+    return text_content

 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
--- a/changedetectionio/run_basic_tests.sh
+++ b/changedetectionio/run_basic_tests.sh
@@ -38,6 +38,9 @@ pytest tests/test_backend.py
 pytest tests/test_rss.py
 pytest tests/test_unique_lines.py

+# Re-run the history consistency test under high concurrency (FETCH_WORKERS=130)
+FETCH_WORKERS=130 pytest  tests/test_history_consistency.py -v -l
+
 # Check file:// will pickup a file when enabled
 echo "Hello world" > /tmp/test-file.txt
 ALLOW_FILE_URI=yes pytest tests/test_security.py
--- a/changedetectionio/tests/test_history_consistency.py
+++ b/changedetectionio/tests/test_history_consistency.py
@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs

 def test_consistent_history(client, live_server, measure_memory_usage):
    live_server_setup(live_server)
-
-    r = range(1, 30)
+    workers = int(os.getenv("FETCH_WORKERS", 10))
+    r = range(1, 10+workers)

    for one in r:
        test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)