Fix: Sanitize radio stream title (#1267)
author sprocket-9 <sprocketnumber9@gmail.com>
Sat, 4 May 2024 21:25:02 +0000 (22:25 +0100)
committer GitHub <noreply@github.com>
Sat, 4 May 2024 21:25:02 +0000 (23:25 +0200)
music_assistant/common/helpers/util.py
music_assistant/server/helpers/audio.py
tests/test_radio_stream_title.py [new file with mode: 0644]

index 989588a73be48aa291e0fe1592db7d021e26142d..bbf28c1845cdd3585064ad71d33b29f53f4444cb 100644 (file)
@@ -4,9 +4,11 @@ from __future__ import annotations
 
 import asyncio
 import os
+import re
 import socket
 from collections.abc import Callable
 from typing import Any, TypeVar
+from urllib.parse import urlparse
 from uuid import UUID
 
 # pylint: disable=invalid-name
@@ -16,6 +18,15 @@ CALLABLE_T = TypeVar("CALLABLE_T", bound=Callable)
 CALLBACK_TYPE = Callable[[], None]
 # pylint: enable=invalid-name
 
+keyword_pattern = re.compile("title=|artist=")  # detects key="value" style metadata
+title_pattern = re.compile(r"title=\"(?P<title>.*?)\"")  # quoted value, non-greedy
+artist_pattern = re.compile(r"artist=\"(?P<artist>.*?)\"")  # quoted value, non-greedy
+dot_com_pattern = re.compile(r"(?P<netloc>\(?\w+\.(?:\w+\.)?(\w{2,3})\)?)")  # scheme-less host
+ad_pattern = re.compile(r"((ad|advertisement)_)|^AD\s\d+$|ADBREAK", flags=re.I)  # ad markers
+title_artist_order_pattern = re.compile(r"(?P<title>.+)\sBy:\s(?P<artist>.+)", flags=re.I)
+multi_space_pattern = re.compile(r"\s{2,}")  # runs of 2+ whitespace characters
+end_junk_pattern = re.compile(r"(.+?)(\s\W+)$")  # trailing whitespace + non-word run
+
 
 def filename_from_string(string: str) -> str:
     """Create filename from unsafe string."""
@@ -144,6 +155,78 @@ def get_version_substitute(version_str: str):
     return version_str.strip()
 
 
+def strip_ads(line: str) -> str:
+    """Return "Advert" when line contains an ad marker, otherwise line unchanged."""
+    # search() scans the whole line; any hit replaces the entire title.
+    is_advert = ad_pattern.search(line) is not None
+    return "Advert" if is_advert else line
+
+
+def strip_url(line: str) -> str:
+    """Remove whitespace-separated tokens that parse as a URL with scheme and netloc."""
+    # Parse each token once; joining split() tokens never leaves trailing whitespace.
+    tokens = ((tok, urlparse(tok)) for tok in line.split())
+    return " ".join(tok for tok, url in tokens if not (url.scheme and url.netloc))
+
+
+def strip_dotcom(line: str) -> str:
+    """Remove scheme-less host names (e.g. "WALMRadio.com") from line."""
+    return dot_com_pattern.sub("", line)
+
+
+def strip_end_junk(line: str) -> str:
+    """Drop a trailing run of whitespace followed by non-word characters from line."""
+    return end_junk_pattern.sub(r"\1", line)
+
+
+def swap_title_artist_order(line: str) -> str:
+    """Rewrite "<title> By: <artist>" as "<artist> - <title>"; other lines pass through."""
+    return title_artist_order_pattern.sub(r"\g<artist> - \g<title>", line)
+
+
+def strip_multi_space(line: str) -> str:
+    """Collapse each run of two or more whitespace characters into a single space."""
+    return multi_space_pattern.sub(" ", line)
+
+
+def multi_strip(line: str) -> str:
+    """Run every cleanup step on line, in a fixed order, and trim trailing whitespace."""
+    for step in (strip_ads, strip_url, strip_dotcom, strip_end_junk, swap_title_artist_order):
+        line = step(line)
+    return strip_multi_space(line).rstrip()
+
+
+def clean_stream_title(line: str) -> str:
+    """Strip junk text from radio streamtitle.
+
+    Lines without a ``title=``/``artist=`` keyword are cleaned as a whole by
+    multi_strip. Otherwise the quoted title and artist values are extracted and
+    cleaned separately; the result is "artist - title", the title alone, the
+    artist alone, or "" when neither value survives cleaning.
+    """
+    if not keyword_pattern.search(line):
+        return multi_strip(line)
+
+    title = ""
+    if match := title_pattern.search(line):
+        title = multi_strip(match.group("title"))
+
+    artist = ""
+    if match := artist_pattern.search(line):
+        artist = multi_strip(match.group("artist"))
+    if artist == title:
+        # Some stations repeat the full title in the artist field; ignore it.
+        artist = ""
+
+    if not title:
+        # Either only the artist is known, or nothing usable was found ("").
+        return artist
+    if not artist or " - " in title:
+        # A title containing " - " is assumed to already be "artist - title".
+        return title
+    return f"{artist} - {title}"
+
+
 async def get_ip():
     """Get primary IP-address for this host."""
 
index 9f2ead4a65d7e1b14b660ae2b1f640bc220246bb..d2029ebc7a02e8690674ee4af52b1d79ca62da17 100644 (file)
@@ -23,6 +23,7 @@ from music_assistant.common.helpers.global_cache import (
     set_global_cache_values,
 )
 from music_assistant.common.helpers.json import JSON_DECODE_EXCEPTIONS, json_loads
+from music_assistant.common.helpers.util import clean_stream_title
 from music_assistant.common.models.enums import MediaType, StreamType
 from music_assistant.common.models.errors import (
     AudioError,
@@ -539,8 +540,13 @@ async def get_icy_stream(
             if not stream_title:
                 continue
             stream_title = stream_title.group(1).decode()
-            if stream_title != streamdetails.stream_title:
-                streamdetails.stream_title = stream_title
+            cleaned_stream_title = clean_stream_title(stream_title)
+            if cleaned_stream_title != streamdetails.stream_title:
+                LOGGER.log(VERBOSE_LOG_LEVEL, "ICY Radio streamtitle original: %s", stream_title)
+                LOGGER.log(
+                    VERBOSE_LOG_LEVEL, "ICY Radio streamtitle cleaned: %s", cleaned_stream_title
+                )
+                streamdetails.stream_title = cleaned_stream_title
 
 
 async def get_hls_stream(
@@ -596,7 +602,15 @@ async def get_hls_stream(
                 logger.debug("Station support for in-playlist metadata: %s", has_playlist_metadata)
             if has_playlist_metadata and chunk_item.title != "no desc":
                 # bbc (and maybe others?) set the title to 'no desc'
-                streamdetails.stream_title = chunk_item.title
+                cleaned_stream_title = clean_stream_title(chunk_item.title)
+                if cleaned_stream_title != streamdetails.stream_title:
+                    logger.log(
+                        VERBOSE_LOG_LEVEL, "HLS Radio streamtitle original: %s", chunk_item.title
+                    )
+                    logger.log(
+                        VERBOSE_LOG_LEVEL, "HLS Radio streamtitle cleaned: %s", cleaned_stream_title
+                    )
+                    streamdetails.stream_title = cleaned_stream_title
             logger.log(VERBOSE_LOG_LEVEL, "playing chunk %s", chunk_item)
             # prevent that we play this chunk again if we loop through
             prev_chunks.append(chunk_item.path)
diff --git a/tests/test_radio_stream_title.py b/tests/test_radio_stream_title.py
new file mode 100644 (file)
index 0000000..8dbb5f9
--- /dev/null
@@ -0,0 +1,71 @@
+"""Tests for cleaning radio streamtitle."""
+
+from music_assistant.common.helpers.util import clean_stream_title
+
+
+def test_cleaning_streamtitle() -> None:
+    """Check clean_stream_title against ad markers, key="value" metadata and site suffixes."""
+    tstm = "Thirty Seconds To Mars - Closer to the Edge"
+    advert = "Advert"
+
+    line = "Advertisement_Start_Length=00:00:29.960"
+    stream_title = clean_stream_title(line)
+    assert stream_title == advert
+
+    line = "Advertisement_Stop"
+    stream_title = clean_stream_title(line)
+    assert stream_title == advert
+
+    line = "START_AD_BREAK_6000"
+    stream_title = clean_stream_title(line)
+    assert stream_title == advert
+
+    line = "STOP ADBREAK 1"
+    stream_title = clean_stream_title(line)
+    assert stream_title == advert
+
+    line = "AD 2"
+    stream_title = clean_stream_title(line)
+    assert stream_title == advert
+
+    line = 'title="Thirty Seconds To Mars - Closer to the Edge",artist="Thirty Seconds To Mars - Closer to the Edge",url="https://nowplaying.scahw.com.au/c/fd8ee07bed6a5e4e9824a11aa02dd34a.jpg?t=1714568458&l=250"'  # noqa: E501
+    stream_title = clean_stream_title(line)
+    assert stream_title == tstm
+
+    line = 'title="https://listenapi.planetradio.co.uk/api9.2/eventdata/247801912",url="https://listenapi.planetradio.co.uk/api9.2/eventdata/247801912"'
+    stream_title = clean_stream_title(line)
+    assert stream_title == ""
+
+    line = 'title="Thirty Seconds To Mars - Closer to the Edge https://nowplaying.scahw.com.au/",artist="Thirty Seconds To Mars - Closer to the Edge",url="https://nowplaying.scahw.com.au/c/fd8ee07bed6a5e4e9824a11aa02dd34a.jpg?t=1714568458&l=250"'  # noqa: E501
+    stream_title = clean_stream_title(line)
+    assert stream_title == tstm
+
+    line = 'title="Closer to the Edge",artist="Thirty Seconds To Mars",url="https://nowplaying.scahw.com.au/c/fd8ee07bed6a5e4e9824a11aa02dd34a.jpg?t=1714568458&l=250"'
+    stream_title = clean_stream_title(line)
+    assert stream_title == tstm
+
+    line = 'title="Thirty Seconds To Mars - Closer to the Edge"'
+    stream_title = clean_stream_title(line)
+    assert stream_title == tstm
+
+    line = "Thirty Seconds To Mars - Closer to the Edge https://nowplaying.scahw.com.au/"
+    stream_title = clean_stream_title(line)
+    assert stream_title == tstm
+
+    line = "Lonely Street  By: Andy Williams - WALMRadio.com"
+    stream_title = clean_stream_title(line)
+    assert stream_title == "Andy Williams - Lonely Street"
+
+    line = "Bye Bye Blackbird  By: Sammy Davis Jr. - WALMRadio.com"
+    stream_title = clean_stream_title(line)
+    assert stream_title == "Sammy Davis Jr. - Bye Bye Blackbird"
+
+    line = (
+        "Asha Bhosle, Mohd Rafi (mp3yaar.com) - Gunguna Rahe Hain Bhanwre - Araadhna (mp3yaar.com)"
+    )
+    stream_title = clean_stream_title(line)
+    assert stream_title == "Asha Bhosle, Mohd Rafi - Gunguna Rahe Hain Bhanwre - Araadhna"
+
+    line = "Mohammed Rafi(Jatt.fm) - Rang Aur Noor Ki Baraat (Ghazal)(Jatt.fm)"
+    stream_title = clean_stream_title(line)
+    assert stream_title == "Mohammed Rafi - Rang Aur Noor Ki Baraat (Ghazal)"