From: OzGav Date: Mon, 2 Feb 2026 08:37:27 +0000 (+1100) Subject: Fix track import with multiple artists and mixed separators (#3065) X-Git-Url: https://git.kitaultman.com/?a=commitdiff_plain;h=f9dbf101332363be593d9d5fbcb18275693fd0a3;p=music-assistant-server.git Fix track import with multiple artists and mixed separators (#3065) * Fix metadata with triple artist track * Increase robustness of artist name parsing logic * Add TODO comment --- diff --git a/music_assistant/helpers/tags.py b/music_assistant/helpers/tags.py index 2623251a..57a1a508 100644 --- a/music_assistant/helpers/tags.py +++ b/music_assistant/helpers/tags.py @@ -59,39 +59,164 @@ def split_items( return clean_tuple((org_str,)) +# Artist splitting logic: +# When not using the multi-artist tag (ARTISTS), the artist string may contain +# multiple artists in freeform. Featuring artists may also be included in this +# string. We parse and separate them based on common splitter patterns. +# +# We use the MusicBrainz Artist ID count as a guide for how many artists to extract: +# - 0 IDs: only split on "featuring" splitters (to capture feat. artists in DB) +# - 1 ID: don't split at all (single artist confirmed) +# - 2+ IDs: split on featuring first, then extra splitters until we reach the target count +# +# TODO: If a MusicBrainz mirror/local database was available, artist names could be +# looked up directly using the MB Artist IDs from the tags, eliminating the need for +# ARTISTS tag parsing or ARTIST tag splitting entirely. +# +# Featuring splitters - always split on these to capture featuring artists in the database +FEATURING_SPLITTERS = [ + " featuring ", + " Featuring ", + " feat. ", + " Feat. ", + " feat ", + " Feat ", + " duet with ", + " Duet With ", + " ft. ", + " Ft. ", + " vs. ", + " Vs. ", +] + +# Extra splitters - only use these when we have MB ID evidence of multiple artists +EXTRA_SPLITTERS = [" & ", ", ", " + ", " with ", " With "] + + +def _split_on_featuring(item: str) -> list[str]: + """Split a string on featuring splitters, returns list of parts.""" + for splitter in FEATURING_SPLITTERS: + if splitter in item: + parts = [] + for subitem in item.split(splitter): + clean_item = subitem.strip() + if clean_item: + # Recursively process each part for nested featuring splitters + parts.extend(_split_on_featuring(clean_item)) + return parts + return [item] + + +def _split_to_target_count( + artists: list[str], + expected_count: int, + org_artists: str | tuple[str, ...], +) -> list[str]: + """Split artists on extra splitters to reach expected count. + + :param artists: List of artists after featuring splits. + :param expected_count: Target number of artists. + :param org_artists: Original input for logging. + """ + current_artists = list(artists) + stopped_early = False + + # Keep iterating until we reach the target or can't split anymore + while len(current_artists) < expected_count: + made_progress = False + + for i, item in enumerate(current_artists): + if len(current_artists) >= expected_count: + break + + for splitter in EXTRA_SPLITTERS: + if splitter not in item: + continue + + parts = [p.strip() for p in item.split(splitter) if p.strip()] + if len(parts) <= 1: + continue + + potential_count = len(current_artists) - 1 + len(parts) + + if potential_count <= expected_count: + # Safe to split fully - replace item with its parts + current_artists = current_artists[:i] + parts + current_artists[i + 1 :] + made_progress = True + else: + # Splitting would exceed target - do partial split + needed = expected_count - len(current_artists) + 1 + if needed >= 2: + new_parts = [*parts[: needed - 1], splitter.join(parts[needed - 1 :])] + current_artists = current_artists[:i] + new_parts + current_artists[i + 1 :] + made_progress = True + stopped_early = True + break # Only use first matching splitter for this item + + if made_progress: + break # Restart the outer loop with updated list + + if not made_progress: + break # No more splitting possible + + # Remove duplicates while preserving order + seen: set[str] = set() + final_artists = [] + for artist in current_artists: + if artist and artist not in seen: + seen.add(artist) + final_artists.append(artist) + + if stopped_early: + LOGGER.warning( + "Artist splitting stopped early to match expected count %d: '%s'", + expected_count, + org_artists, + ) + elif len(final_artists) < expected_count: + LOGGER.warning( + "Could not split artist string to match expected count %d (got %d): '%s'", + expected_count, + len(final_artists), + org_artists, + ) + + return final_artists + + def split_artists( - org_artists: str | tuple[str, ...], allow_extra_splitters: bool = False + org_artists: str | tuple[str, ...], + expected_count: int | None = None, ) -> tuple[str, ...]: - """Parse all artists from a string.""" - final_artists: list[str] = [] - # when not using the multi artist tag, the artist string may contain - # multiple artists in freeform, even featuring artists may be included in this - # string. Try to parse the featuring artists and separate them. - splitters = [ - " featuring ", - " feat. ", - " feat ", - " duet with ", - " with ", - " ft. ", - " vs. ", - ] - splitters += [x.title() for x in splitters] - if allow_extra_splitters: - splitters += [" & ", ", ", " + "] + """Parse artists from a string, guided by expected artist count. + + :param org_artists: The artist string or tuple of strings to parse. + :param expected_count: Expected number of artists (typically from MB artist IDs). + If None or 0: only split on "featuring" splitters, no extra splitting. + If 1: return as-is without any splitting. + If > 1: split on featuring splitters first, then extra splitters to reach target. + """ artists = split_items(org_artists, allow_unsafe_splitters=False) - for item in artists: - for splitter in splitters: - if splitter not in item: - continue - for subitem in item.split(splitter): - clean_item = subitem.strip() - if clean_item and clean_item not in final_artists: - final_artists.append(subitem.strip()) - if not final_artists: - # none of the extra splitters was found + + # If expected_count is 1, return as-is without any splitting + if expected_count == 1: return artists - return tuple(final_artists) + + # Step 1: Always split on featuring splitters + final_artists: list[str] = [] + for item in artists: + for part in _split_on_featuring(item): + if part and part not in final_artists: + final_artists.append(part) + + # Step 2: If no expected_count or already at/above target, we're done + if not expected_count or expected_count <= 1 or len(final_artists) >= expected_count: + return tuple(final_artists) if final_artists else artists + + # Step 3: Need more artists - split on extra splitters to reach expected_count + final_artists = _split_to_target_count(final_artists, expected_count, org_artists) + + return tuple(final_artists) if final_artists else artists @dataclass @@ -155,39 +280,36 @@ class AudioTags: @property def artists(self) -> tuple[str, ...]: """Return track artists.""" - # prefer multi-artist tag + # prefer multi-artist tag (ARTISTS plural) if tag := self.tags.get("artists"): - return split_items(tag) + artists = split_items(tag) + # Warn if ARTISTS tag count doesn't match MB Artist ID count + mb_id_count = len(self.musicbrainz_artistids) + if mb_id_count and mb_id_count != len(artists): + LOGGER.warning( + "ARTISTS tag count (%d) does not match MusicBrainz Artist ID count (%d): %s", + len(artists), + mb_id_count, + tag, + ) + return artists # fallback to regular artist string if tag := self.tags.get("artist"): if TAG_SPLITTER in tag: return split_items(tag) - if len(self.musicbrainz_artistids) > 1: - # special case: artist noted as 2 artists with ampersand or other splitter - # but with 2 mb ids so they should be treated as 2 artists - # example: John Travolta & Olivia Newton John on the Grease album - return split_artists(tag, allow_extra_splitters=True) - - # Check if we have evidence of a SINGLE artist (should NOT split) - has_single_mb_id = len(self.musicbrainz_artistids) == 1 - artists_plural = self.tags.get("artists", "") - has_single_in_artists_tag = artists_plural and TAG_SPLITTER not in artists_plural - - if has_single_mb_id or has_single_in_artists_tag: - # Single artist confirmed by either single MB ID or ARTISTS tag without semicolons - # Return as-is without splitting to avoid incorrectly splitting artist names - # containing "with", "featuring", etc. - # Example: "Jerk With a Bomb" should not be split into "Jerk" and "a Bomb" - return (tag,) - - # No evidence of single artist, proceed with splitting - return split_artists(tag) + # Use MB artist ID count to guide splitting + # - 0 IDs: only split on "feat." etc., not on "&" or "," + # - 1 ID: don't split at all + # - 2+ IDs: split to match the expected count + mb_id_count = len(self.musicbrainz_artistids) + return split_artists(tag, expected_count=mb_id_count if mb_id_count else None) # fallback to parsing from filename title = self.filename.rsplit(os.sep, 1)[-1].split(".")[0] if " - " in title: title_parts = title.split(" - ") if len(title_parts) >= 2: - return split_artists(title_parts[0]) + # No MB IDs from filename, only split on featuring splitters + return split_artists(title_parts[0], expected_count=None) return (UNKNOWN_ARTIST,) @property @@ -200,39 +322,34 @@ class AudioTags: if tag := self.tags.get("writer"): if TAG_SPLITTER in tag: return split_items(tag) - return split_artists(tag) + # No MB IDs for writers, only split on featuring splitters + return split_artists(tag, expected_count=None) return () @property def album_artists(self) -> tuple[str, ...]: """Return (all) album artists (if any).""" - # prefer multi-artist tag + # prefer multi-artist tag (ALBUMARTISTS plural) if tag := self.tags.get("albumartists"): - return split_items(tag) - # fallback to regular artist string + artists = split_items(tag) + # Warn if ALBUMARTISTS tag count doesn't match MB Album Artist ID count + mb_id_count = len(self.musicbrainz_albumartistids) + if mb_id_count and mb_id_count != len(artists): + LOGGER.warning( + "ALBUMARTISTS tag count (%d) does not match MusicBrainz Album Artist ID " + "count (%d): %s", + len(artists), + mb_id_count, + tag, + ) + return artists + # fallback to regular album artist string if tag := self.tags.get("albumartist"): if TAG_SPLITTER in tag: return split_items(tag) - if len(self.musicbrainz_albumartistids) > 1: - # special case: album artist noted as 2 artists with ampersand or other splitter - # but with 2 mb ids so they should be treated as 2 artists - # example: John Travolta & Olivia Newton John on the Grease album - return split_artists(tag, allow_extra_splitters=True) - - # Check if we have evidence of a SINGLE album artist (should NOT split) - has_single_mb_id = len(self.musicbrainz_albumartistids) == 1 - albumartists_plural = self.tags.get("albumartists", "") - has_single_in_albumartists_tag = ( - albumartists_plural and TAG_SPLITTER not in albumartists_plural - ) - - if has_single_mb_id or has_single_in_albumartists_tag: - # Single album artist confirmed by either single MB ID or ALBUMARTISTS tag - # without semicolons. Return as-is without splitting. - return (tag,) - - # No evidence of single artist, proceed with splitting - return split_artists(tag) + # Use MB album artist ID count to guide splitting + mb_id_count = len(self.musicbrainz_albumartistids) + return split_artists(tag, expected_count=mb_id_count if mb_id_count else None) return () @property diff --git a/tests/core/test_tags.py b/tests/core/test_tags.py index acd0a11b..172d1fe4 100644 --- a/tests/core/test_tags.py +++ b/tests/core/test_tags.py @@ -4,6 +4,7 @@ import pathlib from music_assistant.constants import UNKNOWN_ARTIST from music_assistant.helpers import tags +from music_assistant.helpers.tags import split_artists RESOURCES_DIR = pathlib.Path(__file__).parent.parent.resolve().joinpath("fixtures") @@ -103,3 +104,85 @@ async def test_parse_metadata_from_invalid_filename() -> None: assert _tags.musicbrainz_artistids == () assert _tags.musicbrainz_releasegroupid is None assert _tags.musicbrainz_recordingid is None + + +def test_split_artists_with_expected_count() -> None: + """Test splitting artists guided by expected count (from MB IDs).""" + # With expected_count=3, should split on extra splitters to reach target + result = split_artists("Shabson, Krgovich & Harris", expected_count=3) + assert result == ("Shabson", "Krgovich", "Harris") + + # With expected_count=3, ampersands should split + result = split_artists("Shabson & Krgovich & Harris", expected_count=3) + assert result == ("Shabson", "Krgovich", "Harris") + + # With expected_count=3, commas should split + result = split_artists("Shabson, Krgovich, Harris", expected_count=3) + assert result == ("Shabson", "Krgovich", "Harris") + + # With expected_count=1, should NOT split at all + result = split_artists("Shabson & Krgovich", expected_count=1) + assert result == ("Shabson & Krgovich",) + + # With expected_count=None (no MB IDs), should NOT split on extra splitters + result = split_artists("Shabson & Krgovich", expected_count=None) + assert result == ("Shabson & Krgovich",) + + # With expected_count=0 (no MB IDs), should NOT split on extra splitters + result = split_artists("Shabson & Krgovich", expected_count=0) + assert result == ("Shabson & Krgovich",) + + +def test_split_artists_featuring() -> None: + """Test that featuring splitters always work regardless of expected_count.""" + # "feat." should always split, even with no expected_count + result = split_artists("John Lennon feat. Yoko Ono", expected_count=None) + assert result == ("John Lennon", "Yoko Ono") + + # "feat." should split even with expected_count=1 (featuring overrides) + # Actually, expected_count=1 means single artist, so we return as-is + result = split_artists("John Lennon feat. Yoko Ono", expected_count=1) + assert result == ("John Lennon feat. Yoko Ono",) + + # "featuring" should work + result = split_artists("Artist A featuring Artist B", expected_count=None) + assert result == ("Artist A", "Artist B") + + # "ft." should work + result = split_artists("Artist A ft. Artist B", expected_count=None) + assert result == ("Artist A", "Artist B") + + +def test_split_artists_no_oversplit() -> None: + """Test that split_artists stops at expected_count and doesn't over-split.""" + # Hall & Oates is a duo, with 2 MB IDs we should split on feat. first + # and get exactly 2 artists + result = split_artists("Hall & Oates feat. David Ruffin", expected_count=2) + assert result == ("Hall & Oates", "David Ruffin") + + # With 3 MB IDs, we should split further + result = split_artists("Hall & Oates feat. David Ruffin", expected_count=3) + assert result == ("Hall", "Oates", "David Ruffin") + + # Simon & Garfunkel with 1 MB ID (the duo) should stay as one + result = split_artists("Simon & Garfunkel", expected_count=1) + assert result == ("Simon & Garfunkel",) + + # Simon & Garfunkel with 2 MB IDs (Paul + Art) should split + result = split_artists("Simon & Garfunkel", expected_count=2) + assert result == ("Simon", "Garfunkel") + + +def test_split_artists_with_not_split() -> None: + """Test that 'with' is only split when we have MB ID evidence.""" + # "with" should NOT split without expected_count (could be artist name) + result = split_artists("Jerk With a Bomb", expected_count=None) + assert result == ("Jerk With a Bomb",) + + # "with" should NOT split with expected_count=1 + result = split_artists("Jerk With a Bomb", expected_count=1) + assert result == ("Jerk With a Bomb",) + + # "with" SHOULD split when expected_count=2 indicates multiple artists + result = split_artists("Artist A with Artist B", expected_count=2) + assert result == ("Artist A", "Artist B")