Automatically apply any XML/RSS namespaces
This commit is contained in:
@@ -8,6 +8,7 @@ from xml.sax.saxutils import escape as xml_escape
|
||||
import json
|
||||
import re
|
||||
|
||||
from loguru import logger
|
||||
|
||||
# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
|
||||
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
|
||||
@@ -108,6 +109,20 @@ def elementpath_tostring(obj):
|
||||
|
||||
return str(obj)
|
||||
|
||||
def extract_namespaces(xml_content):
    """
    Collect every namespace declaration found in an XML document.

    The document is streamed through lxml's ``iterparse`` listening only for
    ``start-ns`` events, each of which carries a ``(prefix, uri)`` pair.

    :param xml_content: the raw XML document as bytes (it is wrapped in a
        ``BytesIO`` before being handed to the parser).
    :return: dict mapping namespace prefix -> namespace URI. When the same
        prefix is declared more than once, the last declaration seen wins.

    Any exception raised by the parser while iterating is not caught here and
    propagates to the caller.
    """
    from io import BytesIO
    from lxml import etree

    ns_events = etree.iterparse(BytesIO(xml_content), events=('start-ns',))
    # Each event is (event_name, (prefix, uri)); only the pair is of interest.
    return {prefix: uri for _event, (prefix, uri) in ns_events}
|
||||
|
||||
# Return str Utf-8 of matched rules
|
||||
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
|
||||
from lxml import etree, html
|
||||
@@ -123,7 +138,14 @@ def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False
|
||||
tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
|
||||
html_block = ""
|
||||
|
||||
r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
|
||||
# Automatically extract all namespaces from the XML content
|
||||
namespaces = {'re': 'http://exslt.org/regular-expressions'}
|
||||
try:
|
||||
namespaces.update(extract_namespaces(html_content.encode('utf-8')))
|
||||
except Exception as e:
|
||||
logger.warning(f"Problem extracting namespaces from HTMl/XML content {str(e)}")
|
||||
|
||||
r = elementpath.select(tree, xpath_filter.strip(), namespaces=namespaces, parser=XPath3Parser)
|
||||
#@note: //title/text() won't work where <title>CDATA..
|
||||
|
||||
if type(r) != list:
|
||||
|
||||
@@ -77,11 +77,12 @@ class perform_site_check(difference_detection_processor):
|
||||
|
||||
ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
|
||||
# Go into RSS preprocess for converting CDATA/comment to usable text
|
||||
if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
|
||||
if '<rss' in self.fetcher.content[:100].lower():
|
||||
# Ctype_header could be unset if we are just reprocessing the existing content
|
||||
if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']) or not ctype_header:
|
||||
top_text = self.fetcher.content[:200].lower().strip()
|
||||
if '<rss' in top_text or 'search.yahoo.com/mrss/' in top_text:
|
||||
self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
|
||||
is_rss = True
|
||||
|
||||
# source: support, basically treat it as plaintext
|
||||
if watch.is_source_type_url:
|
||||
is_html = False
|
||||
|
||||
Reference in New Issue
Block a user