Merge branch 'revert-multiprocess-htmlt-to-text' into realtime-ui
This commit is contained in:
@@ -436,55 +436,61 @@ def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False
|
|||||||
return re.sub(pattern, repl, html_content)
|
return re.sub(pattern, repl, html_content)
|
||||||
|
|
||||||
|
|
||||||
def html_to_text_sub_worker(conn, html_content: str, render_anchor_tag_content=False, is_rss=False):
|
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
|
||||||
|
import os
|
||||||
|
|
||||||
|
def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
|
||||||
from inscriptis import get_text
|
from inscriptis import get_text
|
||||||
from inscriptis.model.config import ParserConfig
|
from inscriptis.model.config import ParserConfig
|
||||||
|
try:
|
||||||
|
if render_anchor_tag_content:
|
||||||
|
parser_config = ParserConfig(
|
||||||
|
annotation_rules={"a": ["hyperlink"]},
|
||||||
|
display_links=True
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
parser_config = None
|
||||||
|
|
||||||
"""Converts html string to a string with just the text. If ignoring
|
if is_rss:
|
||||||
rendering anchor tag content is enable, anchor tag content are also
|
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
|
||||||
included in the text
|
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
||||||
|
|
||||||
:param html_content: string with html content
|
text_content = get_text(html_content, config=parser_config)
|
||||||
:param render_anchor_tag_content: boolean flag indicating whether to extract
|
|
||||||
hyperlinks (the anchor tag content) together with text. This refers to the
|
|
||||||
'href' inside 'a' tags.
|
|
||||||
Anchor tag content is rendered in the following manner:
|
|
||||||
'[ text ](anchor tag content)'
|
|
||||||
:return: extracted text from the HTML
|
|
||||||
"""
|
|
||||||
# if anchor tag content flag is set to True define a config for
|
|
||||||
# extracting this content
|
|
||||||
if render_anchor_tag_content:
|
|
||||||
parser_config = ParserConfig(
|
|
||||||
annotation_rules={"a": ["hyperlink"]},
|
|
||||||
display_links=True
|
|
||||||
)
|
|
||||||
# otherwise set config to None/default
|
|
||||||
else:
|
|
||||||
parser_config = None
|
|
||||||
|
|
||||||
# RSS Mode - Inscriptis will treat `title` as something else.
|
with open(temp_file_path, "w", encoding="utf-8") as f:
|
||||||
# Make it as a regular block display element (//item/title)
|
f.write(text_content)
|
||||||
# This is a bit of a hack - the real way it to use XSLT to convert it to HTML #1874
|
|
||||||
if is_rss:
|
|
||||||
html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
|
|
||||||
html_content = re.sub(r'</title>', r'</h1>', html_content)
|
|
||||||
|
|
||||||
text_content = get_text(html_content, config=parser_config)
|
except Exception as e:
|
||||||
conn.send(text_content)
|
# Write error to file so the parent can read it
|
||||||
conn.close()
|
with open(temp_file_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(f"[ERROR] {e}")
|
||||||
|
|
||||||
# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
|
import tempfile
|
||||||
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False):
|
from multiprocessing import Process
|
||||||
from multiprocessing import Process, Pipe
|
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:
|
||||||
|
|
||||||
parent_conn, child_conn = Pipe()
|
|
||||||
p = Process(target=html_to_text_sub_worker, args=(child_conn, html_content, render_anchor_tag_content, is_rss))
|
with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
|
||||||
|
temp_file_path = tmp_file.name
|
||||||
|
|
||||||
|
p = Process(
|
||||||
|
target=html_to_text_sub_worker,
|
||||||
|
args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
|
||||||
|
)
|
||||||
p.start()
|
p.start()
|
||||||
text = parent_conn.recv()
|
p.join(timeout)
|
||||||
p.join()
|
|
||||||
return text
|
if p.is_alive():
|
||||||
|
p.terminate()
|
||||||
|
p.join()
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(temp_file_path, "r", encoding="utf-8") as f:
|
||||||
|
result = f.read()
|
||||||
|
finally:
|
||||||
|
os.remove(temp_file_path)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
|
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
|
||||||
def has_ldjson_product_info(content):
|
def has_ldjson_product_info(content):
|
||||||
|
|||||||
@@ -59,8 +59,8 @@ def test_consistent_history(client, live_server, measure_memory_usage):
|
|||||||
assert len(tmp_history) == 1, "History.txt should contain 1 line"
|
assert len(tmp_history) == 1, "History.txt should contain 1 line"
|
||||||
|
|
||||||
# Should be two files,. the history.txt , and the snapshot.txt
|
# Should be two files,. the history.txt , and the snapshot.txt
|
||||||
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path,
|
files_in_watch_dir = os.listdir(os.path.join(live_server.app.config['DATASTORE'].datastore_path, w))
|
||||||
w))
|
|
||||||
# Find the snapshot one
|
# Find the snapshot one
|
||||||
for fname in files_in_watch_dir:
|
for fname in files_in_watch_dir:
|
||||||
if fname != 'history.txt' and 'html' not in fname:
|
if fname != 'history.txt' and 'html' not in fname:
|
||||||
@@ -76,7 +76,6 @@ def test_consistent_history(client, live_server, measure_memory_usage):
|
|||||||
|
|
||||||
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
|
assert len(files_in_watch_dir) == 3, "Should be just three files in the dir, html.br snapshot, history.txt and the extracted text snapshot"
|
||||||
|
|
||||||
|
|
||||||
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
|
json_db_file = os.path.join(live_server.app.config['DATASTORE'].datastore_path, 'url-watches.json')
|
||||||
with open(json_db_file, 'r') as f:
|
with open(json_db_file, 'r') as f:
|
||||||
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
|
assert '"default"' not in f.read(), "'default' probably shouldnt be here, it came from when the 'default' Watch vars were accidently being saved"
|
||||||
|
|||||||
Reference in New Issue
Block a user