diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
index 668f5ca0..69807f35 100644
--- a/changedetectionio/html_tools.py
+++ b/changedetectionio/html_tools.py
@@ -437,60 +437,26 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
-import os
-def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
+
+def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
- try:
- if render_anchor_tag_content:
- parser_config = ParserConfig(
- annotation_rules={"a": ["hyperlink"]},
- display_links=True
- )
- else:
- parser_config = None
- if is_rss:
- html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
- html_content = re.sub(r'</title>', r'</h1>', html_content)
+ if render_anchor_tag_content:
+ parser_config = ParserConfig(
+ annotation_rules={"a": ["hyperlink"]},
+ display_links=True
+ )
+ else:
+ parser_config = None
- text_content = get_text(html_content, config=parser_config)
+ if is_rss:
+ html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
+ html_content = re.sub(r'</title>', r'</h1>', html_content)
- with open(temp_file_path, "w", encoding="utf-8") as f:
- f.write(text_content)
-
- except Exception as e:
- # Write error to file so the parent can read it
- with open(temp_file_path, "w", encoding="utf-8") as f:
- f.write(f"[ERROR] {e}")
-
-import tempfile
-from multiprocessing import Process
-def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
-
-
- with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
- temp_file_path = tmp_file.name
-
- p = Process(
- target=html_to_text_sub_worker,
- args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
- )
- p.start()
- p.join(timeout)
-
- if p.is_alive():
- p.terminate()
- p.join()
-
- try:
- with open(temp_file_path, "r", encoding="utf-8") as f:
- result = f.read()
- finally:
- os.remove(temp_file_path)
-
- return result
+ text_content = get_text(html_content, config=parser_config)
+ return text_content
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):
diff --git a/changedetectionio/run_basic_tests.sh b/changedetectionio/run_basic_tests.sh
index 7e13b766..2f179fda 100755
--- a/changedetectionio/run_basic_tests.sh
+++ b/changedetectionio/run_basic_tests.sh
@@ -38,6 +38,9 @@ pytest tests/test_backend.py
pytest tests/test_rss.py
pytest tests/test_unique_lines.py
+# Try high concurrency
+FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
+
# Check file:// will pickup a file when enabled
echo "Hello world" > /tmp/test-file.txt
ALLOW_FILE_URI=yes pytest tests/test_security.py
diff --git a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py
index 9e58e201..4b240312 100644
--- a/changedetectionio/tests/test_history_consistency.py
+++ b/changedetectionio/tests/test_history_consistency.py
@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
def test_consistent_history(client, live_server, measure_memory_usage):
live_server_setup(live_server)
-
- r = range(1, 30)
+ workers = int(os.getenv("FETCH_WORKERS", 10))
+ r = range(1, 10+workers)
for one in r:
test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)