Filters - Support multi line regex (#2889)
This commit is contained in:
@@ -366,22 +366,41 @@ def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None
|
|||||||
# wordlist - list of regexes (str) or words (str)
|
# wordlist - list of regexes (str) or words (str)
|
||||||
# Preserves all linefeeds and other whitespacing, it's not the job of this to remove that
|
# Preserves all linefeeds and other whitespacing, it's not the job of this to remove that
|
||||||
def strip_ignore_text(content, wordlist, mode="content"):
|
def strip_ignore_text(content, wordlist, mode="content"):
|
||||||
i = 0
|
|
||||||
output = []
|
|
||||||
ignore_text = []
|
ignore_text = []
|
||||||
ignore_regex = []
|
ignore_regex = []
|
||||||
ignored_line_numbers = []
|
ignore_regex_multiline = []
|
||||||
|
ignored_lines = []
|
||||||
|
|
||||||
for k in wordlist:
|
for k in wordlist:
|
||||||
# Is it a regex?
|
# Is it a regex?
|
||||||
res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
|
res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
|
||||||
if res:
|
if res:
|
||||||
ignore_regex.append(re.compile(perl_style_slash_enclosed_regex_to_options(k)))
|
res = re.compile(perl_style_slash_enclosed_regex_to_options(k))
|
||||||
|
if res.flags & re.DOTALL or res.flags & re.MULTILINE:
|
||||||
|
ignore_regex_multiline.append(res)
|
||||||
|
else:
|
||||||
|
ignore_regex.append(res)
|
||||||
else:
|
else:
|
||||||
ignore_text.append(k.strip())
|
ignore_text.append(k.strip())
|
||||||
|
|
||||||
for line in content.splitlines(keepends=True):
|
for r in ignore_regex_multiline:
|
||||||
i += 1
|
for match in r.finditer(content):
|
||||||
|
content_lines = content[:match.end()].splitlines(keepends=True)
|
||||||
|
match_lines = content[match.start():match.end()].splitlines(keepends=True)
|
||||||
|
|
||||||
|
end_line = len(content_lines)
|
||||||
|
start_line = end_line - len(match_lines)
|
||||||
|
|
||||||
|
if end_line - start_line <= 1:
|
||||||
|
# Match is empty or in the middle of the line
|
||||||
|
ignored_lines.append(start_line)
|
||||||
|
else:
|
||||||
|
for i in range(start_line, end_line):
|
||||||
|
ignored_lines.append(i)
|
||||||
|
|
||||||
|
line_index = 0
|
||||||
|
lines = content.splitlines(keepends=True)
|
||||||
|
for line in lines:
|
||||||
# Always ignore blank lines in this mode. (when this function gets called)
|
# Always ignore blank lines in this mode. (when this function gets called)
|
||||||
got_match = False
|
got_match = False
|
||||||
for l in ignore_text:
|
for l in ignore_text:
|
||||||
@@ -393,17 +412,19 @@ def strip_ignore_text(content, wordlist, mode="content"):
|
|||||||
if r.search(line):
|
if r.search(line):
|
||||||
got_match = True
|
got_match = True
|
||||||
|
|
||||||
if not got_match:
|
if got_match:
|
||||||
# Not ignored, and should preserve "keepends"
|
ignored_lines.append(line_index)
|
||||||
output.append(line)
|
|
||||||
else:
|
line_index += 1
|
||||||
ignored_line_numbers.append(i)
|
|
||||||
|
ignored_lines = set([i for i in ignored_lines if i >= 0 and i < len(lines)])
|
||||||
|
|
||||||
# Used for finding out what to highlight
|
# Used for finding out what to highlight
|
||||||
if mode == "line numbers":
|
if mode == "line numbers":
|
||||||
return ignored_line_numbers
|
return [i + 1 for i in ignored_lines]
|
||||||
|
|
||||||
return ''.join(output)
|
output_lines = set(range(len(lines))) - ignored_lines
|
||||||
|
return ''.join([lines[i] for i in output_lines])
|
||||||
|
|
||||||
def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
|
||||||
from xml.sax.saxutils import escape as xml_escape
|
from xml.sax.saxutils import escape as xml_escape
|
||||||
|
|||||||
@@ -32,7 +32,6 @@ def test_strip_regex_text_func():
|
|||||||
]
|
]
|
||||||
|
|
||||||
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
|
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines)
|
||||||
|
|
||||||
assert "but 1 lines" in stripped_content
|
assert "but 1 lines" in stripped_content
|
||||||
assert "igNORe-cAse text" not in stripped_content
|
assert "igNORe-cAse text" not in stripped_content
|
||||||
assert "but 1234 lines" not in stripped_content
|
assert "but 1234 lines" not in stripped_content
|
||||||
@@ -43,6 +42,46 @@ def test_strip_regex_text_func():
|
|||||||
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
|
stripped_content = html_tools.strip_ignore_text(test_content, ignore_lines, mode="line numbers")
|
||||||
assert stripped_content == [2, 5, 6, 7, 8, 10]
|
assert stripped_content == [2, 5, 6, 7, 8, 10]
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text(test_content, ['/but 1.+5 lines/s'])
|
||||||
|
assert "but 1 lines" not in stripped_content
|
||||||
|
assert "skip 5 lines" not in stripped_content
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text(test_content, ['/but 1.+5 lines/s'], mode="line numbers")
|
||||||
|
assert stripped_content == [4, 5]
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text(test_content, ['/.+/s'])
|
||||||
|
assert stripped_content == ""
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text(test_content, ['/.+/s'], mode="line numbers")
|
||||||
|
assert stripped_content == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+but.+\\n.+lines$/m'])
|
||||||
|
assert "but 1 lines" not in stripped_content
|
||||||
|
assert "skip 5 lines" not in stripped_content
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+but.+\\n.+lines$/m'], mode="line numbers")
|
||||||
|
assert stripped_content == [4, 5]
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+?\.$/m'])
|
||||||
|
assert "but sometimes we want to remove the lines." not in stripped_content
|
||||||
|
assert "but not always." not in stripped_content
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text(test_content, ['/^.+?\.$/m'], mode="line numbers")
|
||||||
|
assert stripped_content == [2, 11]
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text(test_content, ['/but.+?but/ms'])
|
||||||
|
assert "but sometimes we want to remove the lines." not in stripped_content
|
||||||
|
assert "but 1 lines" not in stripped_content
|
||||||
|
assert "but 1234 lines" not in stripped_content
|
||||||
|
assert "igNORe-cAse text we dont want to keep" not in stripped_content
|
||||||
|
assert "but not always." not in stripped_content
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text(test_content, ['/but.+?but/ms'], mode="line numbers")
|
||||||
|
assert stripped_content == [2, 3, 4, 9, 10, 11]
|
||||||
|
|
||||||
|
stripped_content = html_tools.strip_ignore_text("\n\ntext\n\ntext\n\n", ['/^$/ms'], mode="line numbers")
|
||||||
|
assert stripped_content == [1, 2, 4, 6]
|
||||||
|
|
||||||
# Check that linefeeds are preserved when there are no matching ignores
|
# Check that linefeeds are preserved when there are no matching ignores
|
||||||
content = "some text\n\nand other text\n"
|
content = "some text\n\nand other text\n"
|
||||||
stripped_content = html_tools.strip_ignore_text(content, ignore_lines)
|
stripped_content = html_tools.strip_ignore_text(content, ignore_lines)
|
||||||
|
|||||||
Reference in New Issue
Block a user