Levenshtein text similarity plugin - adding test, fixing import, fixing check for watches with 1 snapshot history

2025-04-30 15:44:29 +02:00
parent d0da8c9825
commit ea303a7bec
4 changed files with 110 additions and 13 deletions
--- a/changedetectionio/conditions/__init__.py
+++ b/changedetectionio/conditions/__init__.py
@@ -5,7 +5,7 @@ from json_logic.builtins import BUILTINS
 from .exceptions import EmptyConditionRuleRowNotUsable
 from .pluggy_interface import plugin_manager  # Import the pluggy plugin manager
 from . import default_plugin
-
+from loguru import logger
 # List of all supported JSON Logic operators
 operator_choices = [
    (None, "Choose one - Operator"),
@@ -113,12 +113,14 @@ def execute_ruleset_against_all_plugins(current_watch_uuid: str, application_dat
                            application_datastruct=application_datastruct,
                            ephemeral_data=ephemeral_data
                        )
-                        
+                        logger.debug(f"Trying plugin {plugin}....")
+
                        # Set a timeout of 10 seconds
                        try:
                            new_execute_data = future.result(timeout=10)
                            if new_execute_data and isinstance(new_execute_data, dict):
                                EXECUTE_DATA.update(new_execute_data)
+
                        except concurrent.futures.TimeoutError:
                            # The plugin took too long, abort processing for this watch
                            raise Exception(f"Plugin {plugin.__class__.__name__} took more than 10 seconds to run.")
--- a/changedetectionio/conditions/plugins/levenshtein_plugin.py
+++ b/changedetectionio/conditions/plugins/levenshtein_plugin.py
@@ -9,15 +9,20 @@ def levenshtein_ratio_recent_history(watch, incoming_text=None):
    try:
        from Levenshtein import ratio, distance
        k = list(watch.history.keys())
-        if len(k) >= 2:
-            # When called from ui_edit_stats_extras, we don't have incoming_text
-            if incoming_text is None:
-                a = watch.get_history_snapshot(timestamp=k[-1])  # Latest snapshot
-                b = watch.get_history_snapshot(timestamp=k[-2])  # Previous snapshot
-            else:
-                a = watch.get_history_snapshot(timestamp=k[-2]) # Second newest, incoming_text will be "newest"
-                b = incoming_text
-            
+        a = None
+        b = None
+
+        # When called from ui_edit_stats_extras, we don't have incoming_text
+        if incoming_text is None:
+            a = watch.get_history_snapshot(timestamp=k[-1])  # Latest snapshot
+            b = watch.get_history_snapshot(timestamp=k[-2])  # Previous snapshot
+
+        # Needs at least one snapshot
+        elif len(k) >= 1: # Should be at least one snapshot to compare against
+            a = watch.get_history_snapshot(timestamp=k[-1]) # Latest saved snapshot
+            b = incoming_text if incoming_text else k[-2]
+
+        if a and b:
            distance_value = distance(a, b)
            ratio_value = ratio(a, b)
            return {
@@ -53,7 +58,7 @@ def add_data(current_watch_uuid, application_datastruct, ephemeral_data):
    # ephemeral_data['text'] will be the current text after filters, they may have edited filters but not saved them yet etc

    if watch and 'text' in ephemeral_data:
-        lev_data = levenshtein_ratio_recent_history(watch, ephemeral_data['text'])
+        lev_data = levenshtein_ratio_recent_history(watch, ephemeral_data.get('text',''))
        if isinstance(lev_data, dict):
            res['levenshtein_ratio'] = lev_data.get('ratio', 0)
            res['levenshtein_similarity'] = lev_data.get('percent_similar', 0)
--- a/changedetectionio/tests/test_conditions.py
+++ b/changedetectionio/tests/test_conditions.py
@@ -235,4 +235,92 @@ def test_wordcount_conditions_plugin(client, live_server, measure_memory_usage):
    )

    # Assert the word count is counted correctly
-    assert b'<td>13</td>' in res.data
+    assert b'<td>13</td>' in res.data
+
+
+
+# A small edit (Levenshtein ratio still above the 0.8 threshold) shouldn't be detected as a change, a large edit should
+def test_lev_conditions_plugin(client, live_server, measure_memory_usage):
+    live_server_setup(live_server)
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("""<html>
+       <body>
+     Some initial text<br>
+     <p>Which is across multiple lines</p>
+     <br>
+     So let's see what happens.  <br>
+     </body>
+     </html>
+    """)
+
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', _external=True)
+    res = client.post(
+        url_for("imports.import_page"),
+        data={"urls": test_url},
+        follow_redirects=True
+    )
+    assert b"1 Imported" in res.data
+
+    uuid = next(iter(live_server.app.config['DATASTORE'].data['watching']))
+    # Give the thread time to pick it up
+    wait_for_all_checks(client)
+    res = client.post(
+        url_for("ui.ui_edit.edit_page", uuid=uuid),
+        data={
+            "url": test_url,
+            "fetch_backend": "html_requests",
+            "conditions_match_logic": "ALL",  # ALL = AND logic
+            "conditions-0-field": "levenshtein_ratio",
+            "conditions-0-operator": "<",
+            "conditions-0-value": "0.8" # needs to be more of a diff to trigger a change
+        },
+        follow_redirects=True
+    )
+    assert b"Updated watch." in res.data
+
+    res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+    assert b'Queued 1 watch for rechecking.' in res.data
+    wait_for_all_checks(client)
+    res = client.get(url_for("watchlist.index"))
+    assert b'unviewed' not in res.data
+
+
+    ############### Now change it a LITTLE bit...
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write("""<html>
+       <body>
+     Some initial text<br>
+     <p>Which is across multiple lines</p>
+     <br>
+     So let's see what happenxxxxxxxxx.  <br>
+     </body>
+     </html>
+    """)
+
+    res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+    assert b'Queued 1 watch for rechecking.' in res.data
+    wait_for_all_checks(client)
+
+    res = client.get(url_for("watchlist.index"))
+    assert b'unviewed' not in res.data #because this will be like 0.90 not 0.8 threshold
+
+    ############### Now change it by MORE THAN 50%
+    test_return_data = """<html>
+       <body>
+     Some sxxxx<br>
+     <p>Which is across a lines</p>
+     <br>
+     ok.  <br>
+     </body>
+     </html>
+    """
+
+    with open("test-datastore/endpoint-content.txt", "w") as f:
+        f.write(test_return_data)
+    res = client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+    assert b'Queued 1 watch for rechecking.' in res.data
+    wait_for_all_checks(client)
+    res = client.get(url_for("watchlist.index"))
+    assert b'unviewed' in res.data
--- a/requirements.txt
+++ b/requirements.txt
@@ -90,6 +90,8 @@ extruct
 # For cleaning up unknown currency formats
 babel

+levenshtein
+
 # Needed for > 3.10, https://github.com/microsoft/playwright-python/issues/2096
 greenlet >= 3.0.3