Merge branch 'revert-multiprocess-htmlt-to-text' into realtime-ui
This commit is contained in:
@@ -437,60 +437,26 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
|
|||||||
|
|
||||||
|
|
||||||
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
|
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
|
||||||
import os
|
|
||||||
|
|
||||||
def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
|
|
||||||
|
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
|
||||||
from inscriptis import get_text
|
from inscriptis import get_text
|
||||||
from inscriptis.model.config import ParserConfig
|
from inscriptis.model.config import ParserConfig
|
||||||
try:
|
|
||||||
if render_anchor_tag_content:
|
|
||||||
parser_config = ParserConfig(
|
|
||||||
annotation_rules={"a": ["hyperlink"]},
|
|
||||||
display_links=True
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
parser_config = None
|
|
||||||
|
|
||||||
if is_rss:
|
if render_anchor_tag_content:
|
||||||
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
|
parser_config = ParserConfig(
|
||||||
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
annotation_rules={"a": ["hyperlink"]},
|
||||||
|
display_links=True
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
parser_config = None
|
||||||
|
|
||||||
text_content = get_text(html_content, config=parser_config)
|
if is_rss:
|
||||||
|
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
|
||||||
|
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
||||||
|
|
||||||
with open(temp_file_path, "w", encoding="utf-8") as f:
|
text_content = get_text(html_content, config=parser_config)
|
||||||
f.write(text_content)
|
return text_content
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
# Write error to file so the parent can read it
|
|
||||||
with open(temp_file_path, "w", encoding="utf-8") as f:
|
|
||||||
f.write(f"[ERROR] {e}")
|
|
||||||
|
|
||||||
import tempfile
|
|
||||||
from multiprocessing import Process
|
|
||||||
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
|
|
||||||
|
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
|
|
||||||
temp_file_path = tmp_file.name
|
|
||||||
|
|
||||||
p = Process(
|
|
||||||
target=html_to_text_sub_worker,
|
|
||||||
args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
|
|
||||||
)
|
|
||||||
p.start()
|
|
||||||
p.join(timeout)
|
|
||||||
|
|
||||||
if p.is_alive():
|
|
||||||
p.terminate()
|
|
||||||
p.join()
|
|
||||||
|
|
||||||
try:
|
|
||||||
with open(temp_file_path, "r", encoding="utf-8") as f:
|
|
||||||
result = f.read()
|
|
||||||
finally:
|
|
||||||
os.remove(temp_file_path)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
|
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
|
||||||
def has_ldjson_product_info(content):
|
def has_ldjson_product_info(content):
|
||||||
|
|||||||
@@ -38,6 +38,9 @@ pytest tests/test_backend.py
|
|||||||
pytest tests/test_rss.py
|
pytest tests/test_rss.py
|
||||||
pytest tests/test_unique_lines.py
|
pytest tests/test_unique_lines.py
|
||||||
|
|
||||||
|
# Try high concurrency
|
||||||
|
FETCH_WORKERS=130 pytest tests/test_history_consistency.py -v -l
|
||||||
|
|
||||||
# Check file:// will pickup a file when enabled
|
# Check file:// will pickup a file when enabled
|
||||||
echo "Hello world" > /tmp/test-file.txt
|
echo "Hello world" > /tmp/test-file.txt
|
||||||
ALLOW_FILE_URI=yes pytest tests/test_security.py
|
ALLOW_FILE_URI=yes pytest tests/test_security.py
|
||||||
|
|||||||
@@ -10,8 +10,8 @@ from urllib.parse import urlparse, parse_qs
|
|||||||
|
|
||||||
def test_consistent_history(client, live_server, measure_memory_usage):
|
def test_consistent_history(client, live_server, measure_memory_usage):
|
||||||
live_server_setup(live_server)
|
live_server_setup(live_server)
|
||||||
|
workers = int(os.getenv("FETCH_WORKERS", 10))
|
||||||
r = range(1, 30)
|
r = range(1, 10+workers)
|
||||||
|
|
||||||
for one in r:
|
for one in r:
|
||||||
test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
|
test_url = url_for('test_endpoint', content_type="text/html", content=str(one), _external=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user