Merge branch 'revert-multiprocess-htmlt-to-text' into realtime-ui

This commit is contained in:
dgtlmoon
2025-05-08 18:55:04 +02:00
3 changed files with 19 additions and 50 deletions

View File

@@ -437,60 +437,26 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
# NOTE!! ANYTHING USING LIBXML, HTML5LIB, ETC. WILL CAUSE A SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
import os
def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
from inscriptis import get_text
from inscriptis.model.config import ParserConfig
try:
if render_anchor_tag_content:
parser_config = ParserConfig(
annotation_rules={"a": ["hyperlink"]},
display_links=True
)
else:
parser_config = None
if is_rss:
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
if render_anchor_tag_content:
parser_config = ParserConfig(
annotation_rules={"a": ["hyperlink"]},
display_links=True
)
else:
parser_config = None
text_content = get_text(html_content, config=parser_config)
if is_rss:
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
html_content = re.sub(r'</title>', r'</h1>', html_content)
with open(temp_file_path, "w", encoding="utf-8") as f:
f.write(text_content)
except Exception as e:
# Write error to file so the parent can read it
with open(temp_file_path, "w", encoding="utf-8") as f:
f.write(f"[ERROR] {e}")
import tempfile
from multiprocessing import Process
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
temp_file_path = tmp_file.name
p = Process(
target=html_to_text_sub_worker,
args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
)
p.start()
p.join(timeout)
if p.is_alive():
p.terminate()
p.join()
try:
with open(temp_file_path, "r", encoding="utf-8") as f:
result = f.read()
finally:
os.remove(temp_file_path)
return result
text_content = get_text(html_content, config=parser_config)
return text_content
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):

View File

@@ -38,6 +38,9 @@ pytest tests/test_backend.py
pytest tests/test_rss.py
pytest tests/test_unique_lines.py
# Try high concurrency
FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
# Check that file:// will pick up a file when enabled
echo "Hello world" > /tmp/test-file.txt
ALLOW_FILE_URI=yes pytest tests/test_security.py

View File

@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
def test_consistent_history(client, live_server, measure_memory_usage):
live_server_setup(live_server)
r = range(1, 30)
workers = int(os.getenv("FETCH_WORKERS", 10))
r = range(1, 10+workers)
for one in r:
test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)