From 24b9e4dc83d88abf829285cffbcf6f964b079a8d Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Thu, 8 May 2025 18:54:14 +0200
Subject: [PATCH] Revert multiprocess HTML-to-text conversion and add
 high-concurrency test

---
 changedetectionio/html_tools.py               | 62 +++++--------------
 changedetectionio/run_basic_tests.sh          |  3 +
 .../tests/test_history_consistency.py         |  4 +-
 3 files changed, 19 insertions(+), 50 deletions(-)
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 668f5ca0..69807f35 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -437,60 +437,26 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
 
 
 # NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-import os
 
-def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
+
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig
-    try:
-        if render_anchor_tag_content:
-            parser_config = ParserConfig(
-                annotation_rules={"a": ["hyperlink"]},
-                display_links=True
-            )
-        else:
-            parser_config = None
 
-        if is_rss:
-            html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
-            html_content = re.sub(r'</title>', r'</h1>', html_content)
+    if render_anchor_tag_content:
+        parser_config = ParserConfig(
+            annotation_rules={"a": ["hyperlink"]},
+            display_links=True
+        )
+    else:
+        parser_config = None
 
-        text_content = get_text(html_content, config=parser_config)
+    if is_rss:
+        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
+        html_content = re.sub(r'</title>', r'</h1>', html_content)
 
-        with open(temp_file_path, "w", encoding="utf-8") as f:
-            f.write(text_content)
-
-    except Exception as e:
-        # Write error to file so the parent can read it
-        with open(temp_file_path, "w", encoding="utf-8") as f:
-            f.write(f"[ERROR] {e}")
-
-import tempfile
-from multiprocessing import Process
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
-
-
-    with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
-        temp_file_path = tmp_file.name
-
-    p = Process(
-        target=html_to_text_sub_worker,
-        args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
-    )
-    p.start()
-    p.join(timeout)
-
-    if p.is_alive():
-        p.terminate()
-        p.join()
-
-    try:
-        with open(temp_file_path, "r", encoding="utf-8") as f:
-            result = f.read()
-    finally:
-        os.remove(temp_file_path)
-
-    return result
+    text_content = get_text(html_content, config=parser_config)
+    return text_content
 
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
diff --git a/changedetectionio/run_basic_tests.sh b/changedetectionio/run_basic_tests.sh
index 7e13b766..2f179fda 100755
--- a/changedetectionio/run_basic_tests.sh
+++ b/changedetectionio/run_basic_tests.sh
@@ -38,6 +38,9 @@ pytest tests/test_backend.py
 pytest tests/test_rss.py
 pytest tests/test_unique_lines.py
 
+# Try high concurrency
+FETCH_WORKERS=130 pytest  tests/test_history_consistency.py -v -l
+
 # Check file:// will pickup a file when enabled
 echo "Hello world" > /tmp/test-file.txt
 ALLOW_FILE_URI=yes pytest tests/test_security.py
diff --git a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py
index 113c54c0..1558c275 100644
--- a/changedetectionio/tests/test_history_consistency.py
+++ b/changedetectionio/tests/test_history_consistency.py
@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
 
 def test_consistent_history(client, live_server, measure_memory_usage):
     live_server_setup(live_server)
-
-    r = range(1, 30)
+    workers = int(os.getenv("FETCH_WORKERS", 10))
+    r = range(1, 10+workers)
 
     for one in r:
         test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)