From d33c59b0636b0cc497b43cdfa08d854a184a312a Mon Sep 17 00:00:00 2001 From: sprocket-9 Date: Sat, 4 May 2024 22:25:02 +0100 Subject: [PATCH] Fix: Sanitize radio stream title (#1267) --- music_assistant/common/helpers/util.py | 83 +++++++++++++++++++++++++ music_assistant/server/helpers/audio.py | 20 +++++- tests/test_radio_stream_title.py | 71 +++++++++++++++++++++ 3 files changed, 171 insertions(+), 3 deletions(-) create mode 100644 tests/test_radio_stream_title.py diff --git a/music_assistant/common/helpers/util.py b/music_assistant/common/helpers/util.py index 989588a7..bbf28c18 100644 --- a/music_assistant/common/helpers/util.py +++ b/music_assistant/common/helpers/util.py @@ -4,9 +4,11 @@ from __future__ import annotations import asyncio import os +import re import socket from collections.abc import Callable from typing import Any, TypeVar +from urllib.parse import urlparse from uuid import UUID # pylint: disable=invalid-name @@ -16,6 +18,15 @@ CALLABLE_T = TypeVar("CALLABLE_T", bound=Callable) CALLBACK_TYPE = Callable[[], None] # pylint: enable=invalid-name +keyword_pattern = re.compile("title=|artist=") +title_pattern = re.compile(r"title=\"(?P<title>.*?)\"") +artist_pattern = re.compile(r"artist=\"(?P<artist>.*?)\"") +dot_com_pattern = re.compile(r"(?P<netloc>\(?\w+\.(?:\w+\.)?(\w{2,3})\)?)") +ad_pattern = re.compile(r"((ad|advertisement)_)|^AD\s\d+$|ADBREAK", flags=re.I) +title_artist_order_pattern = re.compile(r"(?P<title>.+)\sBy:\s(?P<artist>.+)", flags=re.I) +multi_space_pattern = re.compile(r"\s{2,}") +end_junk_pattern = re.compile(r"(.+?)(\s\W+)$") + def filename_from_string(string: str) -> str: """Create filename from unsafe string.""" @@ -144,6 +155,78 @@ def get_version_substitute(version_str: str): return version_str.strip() +def strip_ads(line: str) -> str: + """Strip Ads from line.""" + if ad_pattern.search(line): + return "Advert" + return line + + +def strip_url(line: str) -> str: + """Strip URL from line.""" + return ( + " ".join([p for p in
line.split() if (not urlparse(p).scheme or not urlparse(p).netloc)]) + ).rstrip() + + +def strip_dotcom(line: str): + """Strip scheme-less netloc from line.""" + return dot_com_pattern.sub("", line) + + +def strip_end_junk(line: str) -> str: + """Strip non-word info from end of line.""" + return end_junk_pattern.sub(r"\1", line) + + +def swap_title_artist_order(line: str) -> str: + """Swap title/artist order in line.""" + return title_artist_order_pattern.sub(r"\g<artist> - \g<title>", line) + + +def strip_multi_space(line: str) -> str: + """Strip multi-whitespace from line.""" + return multi_space_pattern.sub(" ", line) + + +def multi_strip(line: str) -> str: + """Strip assorted junk from line.""" + return strip_multi_space( + swap_title_artist_order(strip_end_junk(strip_dotcom(strip_url(strip_ads(line))))) + ).rstrip() + + +def clean_stream_title(line: str) -> str: + """Strip junk text from radio streamtitle.""" + title: str = "" + artist: str = "" + + if not keyword_pattern.search(line): + return multi_strip(line) + + if match := title_pattern.search(line): + title = multi_strip(match.group("title")) + + if match := artist_pattern.search(line): + possible_artist = multi_strip(match.group("artist")) + if possible_artist and possible_artist != title: + artist = possible_artist + + if not title and not artist: + return "" + + if title: + if re.search(" - ", title) or not artist: + return title + if artist: + return f"{artist} - {title}" + + if artist: + return artist + + return line + + async def get_ip(): """Get primary IP-address for this host.""" diff --git a/music_assistant/server/helpers/audio.py b/music_assistant/server/helpers/audio.py index 9f2ead4a..d2029ebc 100644 --- a/music_assistant/server/helpers/audio.py +++ b/music_assistant/server/helpers/audio.py @@ -23,6 +23,7 @@ from music_assistant.common.helpers.global_cache import ( set_global_cache_values, ) from music_assistant.common.helpers.json import JSON_DECODE_EXCEPTIONS, json_loads +from 
music_assistant.common.helpers.util import clean_stream_title from music_assistant.common.models.enums import MediaType, StreamType from music_assistant.common.models.errors import ( AudioError, @@ -539,8 +540,13 @@ async def get_icy_stream( if not stream_title: continue stream_title = stream_title.group(1).decode() - if stream_title != streamdetails.stream_title: - streamdetails.stream_title = stream_title + cleaned_stream_title = clean_stream_title(stream_title) + if cleaned_stream_title != streamdetails.stream_title: + LOGGER.log(VERBOSE_LOG_LEVEL, "ICY Radio streamtitle original: %s", stream_title) + LOGGER.log( + VERBOSE_LOG_LEVEL, "ICY Radio streamtitle cleaned: %s", cleaned_stream_title + ) + streamdetails.stream_title = cleaned_stream_title async def get_hls_stream( @@ -596,7 +602,15 @@ async def get_hls_stream( logger.debug("Station support for in-playlist metadata: %s", has_playlist_metadata) if has_playlist_metadata and chunk_item.title != "no desc": # bbc (and maybe others?) set the title to 'no desc' - streamdetails.stream_title = chunk_item.title + cleaned_stream_title = clean_stream_title(chunk_item.title) + if cleaned_stream_title != streamdetails.stream_title: + logger.log( + VERBOSE_LOG_LEVEL, "HLS Radio streamtitle original: %s", chunk_item.title + ) + logger.log( + VERBOSE_LOG_LEVEL, "HLS Radio streamtitle cleaned: %s", cleaned_stream_title + ) + streamdetails.stream_title = cleaned_stream_title logger.log(VERBOSE_LOG_LEVEL, "playing chunk %s", chunk_item) # prevent that we play this chunk again if we loop through prev_chunks.append(chunk_item.path) diff --git a/tests/test_radio_stream_title.py b/tests/test_radio_stream_title.py new file mode 100644 index 00000000..8dbb5f9d --- /dev/null +++ b/tests/test_radio_stream_title.py @@ -0,0 +1,71 @@ +"""Tests for cleaning radio streamtitle.""" + +from music_assistant.common.helpers.util import clean_stream_title + + +def test_cleaning_streamtitle() -> None: + """Tests for cleaning radio streamtitle.""" 
+ tstm = "Thirty Seconds To Mars - Closer to the Edge" + advert = "Advert" + + line = "Advertisement_Start_Length=00:00:29.960" + stream_title = clean_stream_title(line) + assert stream_title == advert + + line = "Advertisement_Stop" + stream_title = clean_stream_title(line) + assert stream_title == advert + + line = "START_AD_BREAK_6000" + stream_title = clean_stream_title(line) + assert stream_title == advert + + line = "STOP ADBREAK 1" + stream_title = clean_stream_title(line) + assert stream_title == advert + + line = "AD 2" + stream_title = clean_stream_title(line) + assert stream_title == advert + + line = 'title="Thirty Seconds To Mars - Closer to the Edge",artist="Thirty Seconds To Mars - Closer to the Edge",url="https://nowplaying.scahw.com.au/c/fd8ee07bed6a5e4e9824a11aa02dd34a.jpg?t=1714568458&l=250"' # noqa: E501 + stream_title = clean_stream_title(line) + assert stream_title == tstm + + line = 'title="https://listenapi.planetradio.co.uk/api9.2/eventdata/247801912",url="https://listenapi.planetradio.co.uk/api9.2/eventdata/247801912"' + stream_title = clean_stream_title(line) + assert stream_title == "" + + line = 'title="Thirty Seconds To Mars - Closer to the Edge https://nowplaying.scahw.com.au/",artist="Thirty Seconds To Mars - Closer to the Edge",url="https://nowplaying.scahw.com.au/c/fd8ee07bed6a5e4e9824a11aa02dd34a.jpg?t=1714568458&l=250"' # noqa: E501 + stream_title = clean_stream_title(line) + assert stream_title == tstm + + line = 'title="Closer to the Edge",artist="Thirty Seconds To Mars",url="https://nowplaying.scahw.com.au/c/fd8ee07bed6a5e4e9824a11aa02dd34a.jpg?t=1714568458&l=250"' + stream_title = clean_stream_title(line) + assert stream_title == tstm + + line = 'title="Thirty Seconds To Mars - Closer to the Edge"' + stream_title = clean_stream_title(line) + assert stream_title == tstm + + line = "Thirty Seconds To Mars - Closer to the Edge https://nowplaying.scahw.com.au/" + stream_title = clean_stream_title(line) + assert stream_title == 
tstm + + line = "Lonely Street By: Andy Williams - WALMRadio.com" + stream_title = clean_stream_title(line) + assert stream_title == "Andy Williams - Lonely Street" + + line = "Bye Bye Blackbird By: Sammy Davis Jr. - WALMRadio.com" + stream_title = clean_stream_title(line) + assert stream_title == "Sammy Davis Jr. - Bye Bye Blackbird" + + line = ( + "Asha Bhosle, Mohd Rafi (mp3yaar.com) - Gunguna Rahe Hain Bhanwre - Araadhna (mp3yaar.com)" + ) + stream_title = clean_stream_title(line) + assert stream_title == "Asha Bhosle, Mohd Rafi - Gunguna Rahe Hain Bhanwre - Araadhna" + + line = "Mohammed Rafi(Jatt.fm) - Rang Aur Noor Ki Baraat (Ghazal)(Jatt.fm)" + stream_title = clean_stream_title(line) + assert stream_title == "Mohammed Rafi - Rang Aur Noor Ki Baraat (Ghazal)" -- 2.34.1