Fix: Improve accuracy of matching album in directory structure for local filesystem...
author Marek Skrobacki <skrobul@skrobul.com>
Wed, 20 Nov 2024 17:54:40 +0000 (17:54 +0000)
committer GitHub <noreply@github.com>
Wed, 20 Nov 2024 17:54:40 +0000 (18:54 +0100)
music_assistant/providers/filesystem_local/helpers.py
tests/providers/filesystem/test_helpers.py

index 57e87b93d935e99fd2041532470d6829c6b315eb..7e15e550d657ab3e7e0bc3089079a0f304e98613 100644 (file)
@@ -67,6 +67,48 @@ def get_artist_dir(
     return matched_dir
 
 
+def tokenize(input_str: str, delimiters: str) -> list[str]:
+    """Return the non-empty pieces of input_str split on the delimiters regex."""
+    # re.split instead of substituting a "^^^" sentinel: input containing "^^^" stays intact
+    return [token for token in re.split(delimiters, input_str) if token]
+
+
+def _dir_contains_album_name(id3_album_name: str, directory_name: str) -> bool:
+    """Check if a directory name contains an album name.
+
+    Both strings are tokenized with the same delimiters; the directory matches when it
+    has every album token, or its leading tokens joined pass compare_strings().
+
+    First iteration considers the literal dash as one of the separators. The
+    second pass is to catch edge cases where the literal dash is part of the
+    album's name, not an actual separator. For example, an album like 'Aphex
+    Twin - Selected Ambient Works 85-92' would be correctly handled.
+
+    Args:
+        id3_album_name (str): The album name to search for.
+        directory_name (str): The directory name to search in.
+
+    Returns:
+        bool: True if the directory name contains the album name, False otherwise.
+    """
+    for delims in ["[-_ ]", "[_ ]"]:
+        tokenized_album_name = tokenize(id3_album_name, delims)
+        tokenized_dirname = tokenize(directory_name, delims)
+        # No tokens: all() below would be vacuously True and match any directory
+        if not tokenized_album_name:
+            continue
+        # Token-subset match; also covers album names that omit the artist
+        if all(token in tokenized_dirname for token in tokenized_album_name):
+            return True
+        if len(tokenized_album_name) <= len(tokenized_dirname) and compare_strings(
+            "".join(tokenized_album_name),
+            "".join(tokenized_dirname[: len(tokenized_album_name)]),
+            False,
+        ):
+            return True
+    return False
+
+
 def get_album_dir(track_dir: str, album_name: str) -> str | None:
     """Return album/parent directory of a track."""
     parentdir = track_dir
@@ -82,6 +124,20 @@ def get_album_dir(track_dir: str, album_name: str) -> str | None:
         if compare_strings(album_name, dirname.split(" - ")[-1].split("(")[0], False):
             # account for ArtistName - AlbumName (Version) format in the directory name
             return parentdir
+
+        if any(sep in dirname for sep in ["-", " ", "_"]) and album_name:
+            album_chunks = album_name.split(" - ", 1)
+            album_name_includes_artist = len(album_chunks) > 1
+            just_album_name = album_chunks[1] if album_name_includes_artist else None
+
+            # attempt matching using tokenized version of path and album name
+            # with _dir_contains_album_name()
+            if just_album_name and _dir_contains_album_name(just_album_name, dirname):
+                return parentdir
+
+            if _dir_contains_album_name(album_name, dirname):
+                return parentdir
+
         if compare_strings(album_name.split("(")[0], dirname, False):
             # account for AlbumName (Version) format in the album name
             return parentdir
index 591a36ec5ef7bedb8785945b5eae2fa95c6daace..5145d9d91a2e8c0a534f597babc1088dbd947fc8 100644 (file)
@@ -62,17 +62,40 @@ def test_get_artist_dir() -> None:
             "/home/user/Music/Aphex Twin - Selected Ambient Works 85-92 (Remastered) - WEB",
             "/home/user/Music/Aphex Twin - Selected Ambient Works 85-92 (Remastered) - WEB",
         ),
+        # Test tokenizer - dirname with extras
+        (
+            "Fokus - Prewersje",
+            "/home/user/Fokus-Prewersje-PL-WEB-FLAC-2021-PS_INT",
+            "/home/user/Fokus-Prewersje-PL-WEB-FLAC-2021-PS_INT",
+        ),
+        # Test tokenizer - dirname with version and extras
+        (
+            "Layo And Bushwacka - Night Works",
+            "/home/music/Layo_And_Bushwacka-Night_Works_(Reissue)-(XLCD_154X)-FLAC-2003",
+            "/home/music/Layo_And_Bushwacka-Night_Works_(Reissue)-(XLCD_154X)-FLAC-2003",
+        ),
+        # Test tokenizer - extras and approximate match on diacritics
+        (
+            "Łona i Webber - Wyślij Sobie Pocztówkę",
+            "/usr/others/Lona-Discography-PL-FLAC-2020-INT/Lona_I_Webber-Wyslij_Sobie_Pocztowke-PL-WEB-FLAC-2014-PS",
+            "/usr/others/Lona-Discography-PL-FLAC-2020-INT/Lona_I_Webber-Wyslij_Sobie_Pocztowke-PL-WEB-FLAC-2014-PS",
+        ),
+        (
+            "NIC",
+            "/nas/downloads/others/Sokol-NIC-PL-WEB-FLAC-2021",
+            "/nas/downloads/others/Sokol-NIC-PL-WEB-FLAC-2021",
+        ),
         # Test album (version) format
         (
-            "Selected Ambient Works 85-92",
+            "Aphex Twin - Selected Ambient Works 85-92",
             "/home/user/Music/Aphex Twin/Selected Ambient Works 85-92 (Remastered)",
             "/home/user/Music/Aphex Twin/Selected Ambient Works 85-92 (Remastered)",
         ),
         # Test album name in dir
         (
-            "Selected Ambient Works 85-92",
-            "/home/user/Music/RandomDirWithSelected Ambient Works 85-92InIt",
-            "/home/user/Music/RandomDirWithSelected Ambient Works 85-92InIt",
+            "Aphex Twin - Selected Ambient Works 85-92",
+            "/home/user/Music/RandomDirWithAphex Twin - Selected Ambient Works 85-92InIt",
+            "/home/user/Music/RandomDirWithAphex Twin - Selected Ambient Works 85-92InIt",
         ),
         # Test no match
         (