From 1ec86bd38dae69efb0814cf13125bde00b7d02c9 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Thu, 8 May 2025 18:09:47 +0200
Subject: [PATCH 1/2] Revert multiprocess memory management; it was unreliable
 under high concurrency

---
 changedetectionio/html_tools.py               | 19 +++++--------------
 .../tests/test_history_consistency.py         |  5 ++---
 2 files changed, 7 insertions(+), 17 deletions(-)
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index ddd5cbef..19c83b94 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -436,7 +436,10 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
     return re.sub(pattern, repl, html_content)
 
 
-def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
+
+
+# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
 
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig
@@ -472,19 +475,7 @@ def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=F
         html_content = re.sub(r'</title>', r'</h1>', html_content)
 
     text_content = get_text(html_content, config=parser_config)
-    conn.send(text_content)
-    conn.close()
-
-# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
-    from multiprocessing import Process, Pipe
-
-    parent_conn, child_conn = Pipe()
-    p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
-    p.start()
-    text = parent_conn.recv()
-    p.join()
-    return text
+    return text_content
 
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
diff --git a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py
index 13943f6a..8e9335a8 100644
--- a/changedetectionio/tests/test_history_consistency.py
+++ b/changedetectionio/tests/test_history_consistency.py
@@ -58,8 +58,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
             assert len(tmp_history) == 1, "History.txt should contain 1 line"
 
         # Should be two files,. the history.txt , and the snapshot.txt
-        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
-                                                     w))
+        files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
+
         # Find the snapshot one
         for fname in files_in_watch_dir:
             if fname != 'history.txt' and 'html' not in fname:
@@ -75,7 +75,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
 
         assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
 
-
     json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
     with open(json_db_file, 'r') as f:
         assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"

From e38f264750f4f4588db16a45d299863f5212ad87 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Thu, 8 May 2025 18:15:16 +0200
Subject: [PATCH 2/2] Avoid pickling issues by returning the html_to_text()
 subprocess result via a temp file

---
 changedetectionio/html_tools.py | 79 ++++++++++++++++++++-------------
 1 file changed, 47 insertions(+), 32 deletions(-)

diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 19c83b94..668f5ca0 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -436,46 +436,61 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
     return re.sub(pattern, repl, html_content)
 
 
-
-
 # NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
+import os
 
+def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
     from inscriptis import get_text
     from inscriptis.model.config import ParserConfig
+    try:
+        if render_anchor_tag_content:
+            parser_config = ParserConfig(
+                annotation_rules={"a": ["hyperlink"]},
+                display_links=True
+            )
+        else:
+            parser_config = None
 
-    """Converts html string to a string with just the text. If ignoring
-    rendering anchor tag content is enable, anchor tag content are also
-    included in the text
+        if is_rss:
+            html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
+            html_content = re.sub(r'</title>', r'</h1>', html_content)
 
-    :param html_content: string with html content
-    :param render_anchor_tag_content: boolean flag indicating whether to extract
-    hyperlinks (the anchor tag content) together with text. This refers to the
-    'href' inside 'a' tags.
-    Anchor tag content is rendered in the following manner:
-    '[ text ](anchor tag content)'
-    :return: extracted text from the HTML
-    """
-    #  if anchor tag content flag is set to True define a config for
-    #  extracting this content
-    if render_anchor_tag_content:
-        parser_config = ParserConfig(
-            annotation_rules={"a": ["hyperlink"]},
-            display_links=True
-        )
-    # otherwise set config to None/default
-    else:
-        parser_config = None
+        text_content = get_text(html_content, config=parser_config)
 
-    # RSS Mode - Inscriptis will treat `title` as something else.
-    # Make it as a regular block display element (//item/title)
-    # This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
-    if is_rss:
-        html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
-        html_content = re.sub(r'</title>', r'</h1>', html_content)
+        with open(temp_file_path, "w", encoding="utf-8") as f:
+            f.write(text_content)
 
-    text_content = get_text(html_content, config=parser_config)
-    return text_content
+    except Exception as e:
+        # Write the error to the file; NOTE the parent returns the file content verbatim, so it surfaces as "[ERROR] ..." text
+        with open(temp_file_path, "w", encoding="utf-8") as f:
+            f.write(f"[ERROR] {e}")
+
+import tempfile
+from multiprocessing import Process
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
+    """Extract text from HTML in a child process; the result is passed back via a temp file."""
+    # delete=False: only the path is needed here, the child re-opens the file and writes to it
+    with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
+        temp_file_path = tmp_file.name
+
+    p = Process(
+        target=html_to_text_sub_worker,
+        args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
+    )
+    try:
+        # Inside the try so the temp file is also removed if start()/join() raises
+        p.start()
+        p.join(timeout)
+
+        if p.is_alive():
+            # Child exceeded the timeout; whatever is in the file (possibly nothing) is returned
+            p.terminate()
+            p.join()
+
+        with open(temp_file_path, "r", encoding="utf-8") as f:
+            return f.read()
+    finally:
+        os.remove(temp_file_path)
 
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):