Better handling of missing or malformed ID3 tags (#434)
authorMarcel van der Veldt <m.vanderveldt@outlook.com>
Mon, 25 Jul 2022 12:04:55 +0000 (14:04 +0200)
committerGitHub <noreply@github.com>
Mon, 25 Jul 2022 12:04:55 +0000 (14:04 +0200)
* Better handling of missing or malformed ID3 tags

* Add tests for parser logic

* Enforce Various Artists name to be consistent

14 files changed:
.github/workflows/test.yml
music_assistant/constants.py
music_assistant/controllers/music/albums.py
music_assistant/controllers/music/artists.py
music_assistant/controllers/music/playlists.py
music_assistant/controllers/music/radio.py
music_assistant/helpers/tags.py
music_assistant/helpers/util.py
music_assistant/music_providers/filesystem.py
requirements_dev.txt
tests/fixtures/MyArtist - MyTitle without Tags.mp3 [new file with mode: 0644]
tests/fixtures/MyArtist - MyTitle.mp3 [new file with mode: 0644]
tests/fixtures/test.mp3 [new file with mode: 0644]
tests/test_tags.py [new file with mode: 0644]

index 9eca9d5f01fff9f16df606025ef360892a1fd638..8030183460381e6830004c1f6f2cd406403cffe8 100644 (file)
@@ -24,6 +24,8 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
         run: |
+          apt-get update
+          apt-get install ffmpeg
           python -m pip install --upgrade pip
           pip install -r requirements_dev.txt
           pre-commit install-hooks
index 72168bb2a6fc13644afb6db612672faf9911da4c..aa55eef267a9e1720c7be7050fbe567ec91547dc 100755 (executable)
@@ -1,3 +1,7 @@
 """All constants for Music Assistant."""
 
 ROOT_LOGGER_NAME = "music_assistant"
+
+UNKNOWN_ARTIST = "Unknown Artist"
+VARIOUS_ARTISTS = "Various Artists"
+VARIOUS_ARTISTS_ID = "89ad4ac3-39f7-470e-963a-56509c546377"
index 8bb1aa25d130c2c95556dd6b674221db4df7e157..76575cdd6db40fe0393050f9734af8ba61941235 100644 (file)
@@ -4,10 +4,10 @@ from __future__ import annotations
 import asyncio
 from typing import List, Optional, Union
 
+from music_assistant.constants import VARIOUS_ARTISTS
 from music_assistant.helpers.compare import compare_album, loose_compare_strings
 from music_assistant.helpers.database import TABLE_ALBUMS, TABLE_TRACKS
 from music_assistant.helpers.json import json_serializer
-from music_assistant.helpers.tags import FALLBACK_ARTIST
 from music_assistant.models.enums import EventType, MusicProviderFeature, ProviderType
 from music_assistant.models.errors import MediaNotFoundError
 from music_assistant.models.event import MassEvent
@@ -352,7 +352,7 @@ class AlbumsController(MediaControllerBase[Album]):
         # use intermediate set to prevent duplicates
         # filter various artists if multiple artists
         if len(album_artists) > 1:
-            album_artists = {x for x in album_artists if x.name != FALLBACK_ARTIST}
+            album_artists = {x for x in album_artists if (x.name != VARIOUS_ARTISTS)}
         return list(album_artists)
 
     async def _get_artist_mapping(
index 489a3d2dec3514dc0c4a741bec7666f4c0525489..b973ac90306fa87a875d4f88ceed18725e11f081 100644 (file)
@@ -5,6 +5,8 @@ import itertools
 from time import time
 from typing import Any, Dict, List, Optional
 
+from music_assistant.constants import VARIOUS_ARTISTS, VARIOUS_ARTISTS_ID
+from music_assistant.helpers.compare import compare_strings
 from music_assistant.helpers.database import TABLE_ALBUMS, TABLE_ARTISTS, TABLE_TRACKS
 from music_assistant.helpers.json import json_serializer
 from music_assistant.models.enums import EventType, MusicProviderFeature, ProviderType
@@ -223,6 +225,12 @@ class ArtistsController(MediaControllerBase[Artist]):
         """Add a new item record to the database."""
         assert isinstance(item, Artist), "Not a full Artist object"
         assert item.provider_ids, "Artist is missing provider id(s)"
+        # enforce various artists name + id
+        if compare_strings(item.name, VARIOUS_ARTISTS):
+            item.musicbrainz_id = VARIOUS_ARTISTS_ID
+        if item.musicbrainz_id == VARIOUS_ARTISTS_ID:
+            item.name = VARIOUS_ARTISTS
+
         async with self._db_add_lock:
             # always try to grab existing item by musicbrainz_id
             cur_item = None
@@ -274,6 +282,12 @@ class ArtistsController(MediaControllerBase[Artist]):
             metadata = cur_item.metadata.update(item.metadata, item.provider.is_file())
             provider_ids = {*cur_item.provider_ids, *item.provider_ids}
 
+        # enforce various artists name + id
+        if compare_strings(item.name, VARIOUS_ARTISTS):
+            item.musicbrainz_id = VARIOUS_ARTISTS_ID
+        if item.musicbrainz_id == VARIOUS_ARTISTS_ID:
+            item.name = VARIOUS_ARTISTS
+
         await self.mass.database.update(
             self.db_table,
             {"item_id": item_id},
index 018578e31ce201b4284f3972c07be2c872d506fa..2656d570abf854ac94874e07fd890da89291bd02 100644 (file)
@@ -259,6 +259,7 @@ class PlaylistController(MediaControllerBase[Playlist]):
             self.db_table,
             {"item_id": item_id},
             {
+                # always prefer name/owner from updated item here
                 "name": item.name,
                 "sort_name": item.sort_name,
                 "owner": item.owner,
index 0791c6a0925391a73b431729a8301922c7a5e72a..a64aa2fcdb59c109eed70b4b0bc388e2a09f5fcc 100644 (file)
@@ -105,6 +105,7 @@ class RadioController(MediaControllerBase[Radio]):
             self.db_table,
             match,
             {
+                # always prefer name from updated item here
                 "name": item.name,
                 "sort_name": item.sort_name,
                 "metadata": json_serializer(metadata),
index d3ce7da9e54e5bf43171ee17400d3e1c455c18e6..3dbdfda12d9fc6df8304d268ae02c3fdab95a584 100644 (file)
@@ -4,31 +4,43 @@ from __future__ import annotations
 import json
 import os
 from dataclasses import dataclass
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, Union
 
 from requests import JSONDecodeError
 
+from music_assistant.constants import UNKNOWN_ARTIST
 from music_assistant.helpers.process import AsyncProcess
+from music_assistant.helpers.util import try_parse_int
 from music_assistant.models.errors import InvalidDataError
 
-FALLBACK_ARTIST = "Various Artists"
-
-# allowed splitters for titles and artists strings
-# NOTE: do not use '&' or '/' as splitter here as it will cause issues with artists
-# actually having that in the name
-SPLITTERS = (";", ",", "Featuring", " Feat. ", " Feat ", "feat.")
+# the only multi-item splitter we accept is the semicolon,
+# which is also the default in Musicbrainz Picard.
+# the slash is also a common splitter but causes colissions with
+# artists actually containing a slash in the name, such as ACDC
+TAG_SPLITTER = ";"
 
 
 def split_items(org_str: str) -> Tuple[str]:
     """Split up a tags string by common splitter."""
-    if isinstance(org_str, list):
-        return org_str
     if not org_str:
         return tuple()
-    for splitter in SPLITTERS:
-        if splitter in org_str:
-            return tuple((x.strip() for x in org_str.split(splitter)))
-    return (org_str,)
+    if isinstance(org_str, list):
+        return org_str
+    return tuple(x.strip() for x in org_str.split(TAG_SPLITTER))
+
+
+def split_artists(org_artists: Union[str, Tuple[str]]) -> Tuple[str]:
+    """Parse all artists from a string."""
+    final_artists = set()
+    # when not using the multi artist tag, the artist string may contain
+    # multiple artistsin freeform, even featuring artists may be included in this
+    # string. Try to parse the featuring artists and seperate them.
+    splitters = ("featuring", " feat. ", " feat ", "feat.")
+    for item in split_items(org_artists):
+        for splitter in splitters:
+            for subitem in item.split(splitter):
+                final_artists.add(subitem.strip())
+    return tuple(final_artists)
 
 
 @dataclass
@@ -46,18 +58,6 @@ class AudioTags:
     has_cover_image: bool
     filename: str
 
-    @property
-    def artist(self) -> str:
-        """Return artist tag (as-is)."""
-        if tag := self.tags.get("artist"):
-            return tag
-        # fallback to parsing from filename
-        title = self.filename.rsplit(os.sep, 1)[-1].split(".")[0]
-        title_parts = title.split(" - ")
-        if len(title_parts) >= 2:
-            return title_parts[0].strip()
-        return FALLBACK_ARTIST
-
     @property
     def title(self) -> str:
         """Return title tag (as-is)."""
@@ -65,9 +65,10 @@ class AudioTags:
             return tag
         # fallback to parsing from filename
         title = self.filename.rsplit(os.sep, 1)[-1].split(".")[0]
-        title_parts = title.split(" - ")
-        if len(title_parts) >= 2:
-            return title_parts[1].strip()
+        if " - " in title:
+            title_parts = title.split(" - ")
+            if len(title_parts) >= 2:
+                return title_parts[1].strip()
         return title
 
     @property
@@ -78,41 +79,63 @@ class AudioTags:
     @property
     def artists(self) -> Tuple[str]:
         """Return track artists."""
-        return split_items(self.artist)
+        # prefer multi-artist tag
+        if tag := self.tags.get("artists"):
+            return split_items(tag)
+        # fallback to regular artist string
+        if tag := self.tags.get("artist"):
+            if ";" in tag:
+                return split_items(tag)
+            return split_artists(tag)
+        # fallback to parsing from filename
+        title = self.filename.rsplit(os.sep, 1)[-1].split(".")[0]
+        if " - " in title:
+            title_parts = title.split(" - ")
+            if len(title_parts) >= 2:
+                return split_artists(title_parts[0])
+        return (UNKNOWN_ARTIST,)
 
     @property
     def album_artists(self) -> Tuple[str]:
         """Return (all) album artists (if any)."""
-        return split_items(self.tags.get("albumartist"))
+        # prefer multi-artist tag
+        if tag := self.tags.get("albumartists"):
+            return split_items(tag)
+        # fallback to regular artist string
+        if tag := self.tags.get("albumartist"):
+            if ";" in tag:
+                return split_items(tag)
+            return split_artists(tag)
+        return tuple()
 
     @property
     def genres(self) -> Tuple[str]:
         """Return (all) genres, if any."""
-        return split_items(self.tags.get("genre", ""))
+        return split_items(self.tags.get("genre"))
 
     @property
     def disc(self) -> int | None:
         """Return disc tag if present."""
         if tag := self.tags.get("disc"):
-            return int(tag.split("/")[0])
+            return try_parse_int(tag.split("/")[0], None)
         return None
 
     @property
     def track(self) -> int | None:
         """Return track tag if present."""
         if tag := self.tags.get("track"):
-            return int(tag.split("/")[0])
+            return try_parse_int(tag.split("/")[0], None)
         return None
 
     @property
     def year(self) -> int | None:
         """Return album's year if present, parsed from date."""
         if tag := self.tags.get("originalyear"):
-            return int(tag.split("-")[0])
-        if tag := self.tags.get("otiginaldate"):
-            return int(tag.split("-")[0])
+            return try_parse_int(tag.split("-")[0], None)
+        if tag := self.tags.get("originaldate"):
+            return try_parse_int(tag.split("-")[0], None)
         if tag := self.tags.get("date"):
-            return int(tag.split("-")[0])
+            return try_parse_int(tag.split("-")[0], None)
         return None
 
     @property
index 709311533ff0229867de051d168937096cbf0d4e..3b18a9f332bfd8ad7c2cfcec40d06c38ab0cdca9 100755 (executable)
@@ -19,38 +19,31 @@ CALLBACK_TYPE = Callable[[], None]
 # pylint: enable=invalid-name
 
 
-def filename_from_string(string):
+def filename_from_string(string: str) -> str:
     """Create filename from unsafe string."""
     keepcharacters = (" ", ".", "_")
     return "".join(c for c in string if c.isalnum() or c in keepcharacters).rstrip()
 
 
-def try_parse_int(possible_int):
+def try_parse_int(possible_int: Any, default: Optional[int] = 0) -> Optional[int]:
     """Try to parse an int."""
     try:
         return int(possible_int)
     except (TypeError, ValueError):
-        return 0
+        return default
 
 
-async def iter_items(items):
-    """Fake async iterator for compatability reasons."""
-    if not isinstance(items, list):
-        yield items
-    else:
-        for item in items:
-            yield item
-
-
-def try_parse_float(possible_float):
+def try_parse_float(
+    possible_float: Any, default: Optional[float] = 0.0
+) -> Optional[float]:
     """Try to parse a float."""
     try:
         return float(possible_float)
     except (TypeError, ValueError):
-        return 0.0
+        return default
 
 
-def try_parse_bool(possible_bool):
+def try_parse_bool(possible_bool: Any) -> str:
     """Try to parse a bool."""
     if isinstance(possible_bool, bool):
         return possible_bool
index 78c5f7b447aed205f03f1a7755d673f1174c8921..590cf78299d5a83353025ccf6a8f76f9bdfb5add 100644 (file)
@@ -15,8 +15,9 @@ import xmltodict
 from aiofiles.os import wrap
 from aiofiles.threadpool.binary import AsyncFileIO
 
+from music_assistant.constants import VARIOUS_ARTISTS, VARIOUS_ARTISTS_ID
 from music_assistant.helpers.compare import compare_strings
-from music_assistant.helpers.tags import FALLBACK_ARTIST, parse_tags, split_items
+from music_assistant.helpers.tags import parse_tags, split_items
 from music_assistant.helpers.util import create_safe_string, parse_title_and_version
 from music_assistant.models.enums import MusicProviderFeature, ProviderType
 from music_assistant.models.errors import MediaNotFoundError, MusicAssistantError
@@ -494,9 +495,9 @@ class FileSystemProvider(MusicProvider):
                 self.logger.warning(
                     "%s is missing ID3 tag [albumartist], using %s as fallback",
                     track_path,
-                    FALLBACK_ARTIST,
+                    VARIOUS_ARTISTS,
                 )
-                album_artists = [await self._parse_artist(name=FALLBACK_ARTIST)]
+                album_artists = [await self._parse_artist(name=VARIOUS_ARTISTS)]
 
             track.album = await self._parse_album(
                 tags.album,
@@ -633,6 +634,9 @@ class FileSystemProvider(MusicProvider):
             provider_ids={
                 MediaItemProviderId(artist_item_id, self.type, self.id, url=artist_path)
             },
+            musicbrainz_id=VARIOUS_ARTISTS_ID
+            if compare_strings(name, VARIOUS_ARTISTS)
+            else None,
         )
 
         if not await self.exists(artist_path):
index c0049194a81388303898f27e1a30d358104ac760..754759f58fd2fcb4aaaec89c7814541f27347dc9 100644 (file)
@@ -4,7 +4,13 @@ flake8==4.0.1
 mypy==0.961
 pydocstyle==6.1.1
 pylint==2.14.5
-pytest==7.1.2
+pytest-aiohttp==0.3.0
 pytest-cov==3.0.0
+pytest-freezegun==0.4.2
+pytest-socket==0.5.1
+pytest-test-groups==1.0.3
+pytest-sugar==0.9.4
 pytest-timeout==2.1.0
+pytest-xdist==2.5.0
+pytest==7.1.2
 pre-commit==2.20.0
diff --git a/tests/fixtures/MyArtist - MyTitle without Tags.mp3 b/tests/fixtures/MyArtist - MyTitle without Tags.mp3
new file mode 100644 (file)
index 0000000..6be8c43
Binary files /dev/null and b/tests/fixtures/MyArtist - MyTitle without Tags.mp3 differ
diff --git a/tests/fixtures/MyArtist - MyTitle.mp3 b/tests/fixtures/MyArtist - MyTitle.mp3
new file mode 100644 (file)
index 0000000..f1b5988
Binary files /dev/null and b/tests/fixtures/MyArtist - MyTitle.mp3 differ
diff --git a/tests/fixtures/test.mp3 b/tests/fixtures/test.mp3
new file mode 100644 (file)
index 0000000..6be8c43
Binary files /dev/null and b/tests/fixtures/test.mp3 differ
diff --git a/tests/test_tags.py b/tests/test_tags.py
new file mode 100644 (file)
index 0000000..8705200
--- /dev/null
@@ -0,0 +1,75 @@
+"""Tests for parsing ID3 tags functions."""
+
+import pathlib
+
+from music_assistant.helpers import tags
+
+RESOURCES_DIR = pathlib.Path(__file__).parent.resolve().joinpath("fixtures")
+
+FILE_1 = str(RESOURCES_DIR.joinpath("MyArtist - MyTitle.mp3"))
+
+
+async def test_parse_metadata_from_id3tags():
+    """Test parsing of parsing metadata from ID3 tags."""
+    filename = str(RESOURCES_DIR.joinpath("MyArtist - MyTitle.mp3"))
+    _tags = await tags.parse_tags(filename)
+    assert _tags.album == "MyAlbum"
+    assert _tags.title == "MyTitle"
+    assert _tags.duration == 1
+    assert _tags.album_artists == ("MyArtist",)
+    assert _tags.artists == ("MyArtist", "MyArtist2")
+    assert _tags.genres == ("Genre1", "Genre2")
+    assert _tags.musicbrainz_albumartistids == ("abcdefg",)
+    assert _tags.musicbrainz_artistids == ("abcdefg",)
+    assert _tags.musicbrainz_releasegroupid == "abcdefg"
+    assert _tags.musicbrainz_trackid == "abcdefg"
+    # test parsing disc/track number
+    _tags.tags["disc"] = "1"
+    assert _tags.disc == 1
+    _tags.tags["disc"] = "1/1"
+    assert _tags.disc == 1
+    _tags.tags["disc"] = ""
+    assert _tags.disc is None
+    # test parsing album year
+    _tags.tags["date"] = "2022"
+    assert _tags.year == 2022
+    _tags.tags["date"] = "2022-05-05"
+    assert _tags.year == 2022
+    _tags.tags["date"] = "blah"
+    assert _tags.year is None
+    _tags.tags["date"] = ""
+    assert _tags.year is None
+    _tags.tags.pop("date", None)
+    assert _tags.year is None
+
+
+async def test_parse_metadata_from_filename():
+    """Test parsing of parsing metadata from filename."""
+    filename = str(RESOURCES_DIR.joinpath("MyArtist - MyTitle without Tags.mp3"))
+    _tags = await tags.parse_tags(filename)
+    assert _tags.album is None
+    assert _tags.title == "MyTitle without Tags"
+    assert _tags.duration == 1
+    assert _tags.album_artists == tuple()
+    assert _tags.artists == ("MyArtist",)
+    assert _tags.genres == tuple()
+    assert _tags.musicbrainz_albumartistids == tuple()
+    assert _tags.musicbrainz_artistids == tuple()
+    assert _tags.musicbrainz_releasegroupid is None
+    assert _tags.musicbrainz_trackid is None
+
+
+async def test_parse_metadata_from_invalid_filename():
+    """Test parsing of parsing metadata from (invalid) filename."""
+    filename = str(RESOURCES_DIR.joinpath("test.mp3"))
+    _tags = await tags.parse_tags(filename)
+    assert _tags.album is None
+    assert _tags.title == "test"
+    assert _tags.duration == 1
+    assert _tags.album_artists == tuple()
+    assert _tags.artists == (tags.UNKNOWN_ARTIST,)
+    assert _tags.genres == tuple()
+    assert _tags.musicbrainz_albumartistids == tuple()
+    assert _tags.musicbrainz_artistids == tuple()
+    assert _tags.musicbrainz_releasegroupid is None
+    assert _tags.musicbrainz_trackid is None