#!/usr/bin/env python3
# Provenance: recovered from a cgit patch view of commit
# c7956ae9b228054d57897ea338ad4154cc0b7221 ("Initial commit: susan automation
# scripts", Caine, Sun, 15 Feb 2026 09:41:49 +0000), which added this file as
# scrape_discogs_labels.py. Per that commit message the repository covers
# overnight transcoding, music discovery/import, system health reports, a
# stats page generator and bookmark management; secrets are stored in
# /etc/automation/ (not in the repo).
"""Scrape releases from Discogs labels and save to JSON for the music pipeline.

Reads label URLs from Tom's bookmarks (Discogs folder) and from
music_config.json, scrapes each label's releases through the Discogs API and
prints one JSON document on stdout. All progress and error messages go to
stderr so stdout stays machine-readable.
"""

import json
import re
import subprocess
import sys
import time
from pathlib import Path
from typing import Optional
from urllib.parse import unquote

HEADERS = {
    "User-Agent": "MusicRecommender/1.0 +https://github.com/openclaw"
}

SCRIPTS_DIR = Path(__file__).parent

# NOTE(review): the markup inside this pattern was lost when the patch was
# extracted (only `]*>\s*Discog Labels(.*?)` survived, which can never capture
# anything because the lazy group is not followed by a terminator). It is
# reconstructed here to accept both XBEL (<folder><title>..</title>..</folder>)
# and Netscape (<H3>..</H3><DL>..</DL>) bookmark exports -- TODO confirm
# against the real output of decrypt_bookmarks.js.
_FOLDER_PATTERN = re.compile(
    r"<(?:title|H3)[^>]*>\s*Discog Labels\s*</(?:title|H3)>(.*?)</(?:folder|DL)>",
    re.DOTALL | re.IGNORECASE,
)

# Matches e.g. href="https://www.discogs.com/label/6458-Indochina"; the name
# stops at the closing quote or at a query string.
_LABEL_URL_PATTERN = re.compile(
    r'href="https://www\.discogs\.com/label/(\d+)-([^"?]+)',
    re.IGNORECASE,
)

# Trailing "(2)"-style numeric suffix on an artist name.
_DISAMBIGUATION_SUFFIX = re.compile(r"\s*\(\d+\)$")


def parse_label_bookmarks(bookmarks_xml: str) -> Optional[list[tuple[int, str]]]:
    """Extract Discogs labels from the "Discog Labels" bookmark folder.

    Args:
        bookmarks_xml: Bookmark export text as printed by decrypt_bookmarks.js.

    Returns:
        ``(label_id, label_name)`` tuples in document order, one per distinct
        label ID, or ``None`` when the "Discog Labels" folder is not present.
        The capture stops at the first closing folder tag, so links nested in
        sub-folders after that point are not seen.
    """
    folder_match = _FOLDER_PATTERN.search(bookmarks_xml)
    if folder_match is None:
        return None

    labels = []
    seen_ids = set()
    for match in _LABEL_URL_PATTERN.finditer(folder_match.group(1)):
        label_id = int(match.group(1))
        if label_id in seen_ids:
            continue  # same label bookmarked twice
        seen_ids.add(label_id)
        # Dashes separate words in the URL slug; other characters may be
        # percent-encoded (e.g. "Caf%C3%A9"), so decode after splitting words.
        label_name = unquote(match.group(2).replace("-", " "))
        labels.append((label_id, label_name))
    return labels


def get_labels_from_bookmarks() -> list[tuple[int, str]]:
    """Parse bookmarks and extract Discogs label IDs and names.

    Runs ``node decrypt_bookmarks.js`` (next to this script) and parses its
    stdout. Only looks in the "Discog Labels" folder.

    Returns:
        ``(label_id, label_name)`` tuples; an empty list when the decrypt
        script cannot be run or the folder is missing (best-effort: problems
        are reported on stderr, never raised).
    """
    try:
        result = subprocess.run(
            ["node", str(SCRIPTS_DIR / "decrypt_bookmarks.js")],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
        print(f"Error reading bookmarks: {e}", file=sys.stderr)
        return []

    if result.returncode != 0:
        # Surface the real cause instead of a misleading "folder not found";
        # stdout is still parsed in case the script produced usable output.
        print(
            f"Error reading bookmarks: decrypt_bookmarks.js exited with "
            f"status {result.returncode}: {result.stderr.strip()}",
            file=sys.stderr,
        )

    labels = parse_label_bookmarks(result.stdout)
    if labels is None:
        print("Could not find 'Discog Labels' folder in bookmarks", file=sys.stderr)
        return []
    return labels


def release_from_api(r: dict, label_name: str) -> Optional[dict]:
    """Convert one Discogs API release entry into a pipeline record.

    Args:
        r: One element of the ``releases`` array of a label-releases response.
        label_name: Name of the label being scraped, copied into the record.

    Returns:
        A dict with ``artist``, ``album``, ``year``, ``label`` and
        ``discogs_id`` keys, or ``None`` when the entry has no title, no
        artist, or is credited to "Various". No filtering by release format
        is applied.
    """
    # `or` (rather than a .get default) also covers an explicit JSON null.
    artist = r.get("artist") or "Various"
    title = r.get("title") or ""

    # Remove disambiguation numbers, e.g. "Burial (2)" -> "Burial".
    artist = _DISAMBIGUATION_SUFFIX.sub("", artist)

    if not artist or not title or artist.lower() == "various":
        return None

    return {
        "artist": artist,
        "album": title,
        "year": r.get("year", ""),
        "label": label_name,
        "discogs_id": r.get("id"),
    }


def get_label_releases(label_id: int, label_name: str, max_pages: int = 5) -> list[dict]:
    """Fetch releases from a Discogs label.

    Args:
        label_id: Numeric Discogs label ID.
        label_name: Human-readable label name (used in logs and records).
        max_pages: Maximum number of 100-item result pages to request.

    Returns:
        Release records as produced by ``release_from_api``. On a request or
        decoding error the pages fetched so far are returned (the error is
        printed to stderr).
    """
    # Imported here so the pure parsing helpers in this module remain
    # importable on machines without the third-party dependency.
    import requests

    releases = []

    for page in range(1, max_pages + 1):
        url = f"https://api.discogs.com/labels/{label_id}/releases?page={page}&per_page=100"
        print(f"  Fetching {label_name} page {page}...", file=sys.stderr)

        try:
            resp = requests.get(url, headers=HEADERS, timeout=30)
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding failures on older requests.
            print(f"  Error: {e}", file=sys.stderr)
            break

        for r in data.get("releases", []):
            release = release_from_api(r, label_name)
            if release is not None:
                releases.append(release)

        # Check if more pages
        if page >= data.get("pagination", {}).get("pages", 1):
            break

        time.sleep(1)  # Rate limit

    return releases


def get_labels_from_config() -> list[tuple[int, str]]:
    """Get Discogs labels from music_config.json.

    Reads the ``discogs_labels`` list of ``{"url": ..., "name": ...}`` entries
    from ``music_config.json`` next to this script.

    Returns:
        ``(label_id, label_name)`` tuples for entries that have a name and a
        URL containing ``/label/<id>``; an empty list when the file is missing.
        Read/parse errors are printed to stderr and whatever was collected
        before the error is returned.
    """
    labels = []
    config_path = SCRIPTS_DIR / "music_config.json"

    if not config_path.exists():
        return labels

    try:
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)

        for entry in config.get("discogs_labels", []):
            url = entry.get("url", "")
            name = entry.get("name", "")
            # Extract ID from URL like https://www.discogs.com/label/6170-Tempa
            match = re.search(r"/label/(\d+)", url)
            if match and name:
                labels.append((int(match.group(1)), name))
    except (OSError, ValueError, AttributeError, TypeError) as e:
        # ValueError: malformed JSON; AttributeError/TypeError: wrong shape.
        print(f"Error reading config: {e}", file=sys.stderr)

    return labels


def merge_labels(*sources: list[tuple[int, str]]) -> list[tuple[int, str]]:
    """Concatenate label lists, keeping the first entry seen for each ID."""
    merged = []
    seen_ids = set()
    for source in sources:
        for lid, lname in source:
            if lid not in seen_ids:
                seen_ids.add(lid)
                merged.append((lid, lname))
    return merged


def dedupe_releases(releases: list[dict]) -> list[dict]:
    """Drop releases whose artist+album (case-insensitive) was already seen."""
    seen = set()
    unique = []
    for r in releases:
        # A tuple key cannot collide the way a "|"-joined string could.
        key = (r["artist"].lower(), r["album"].lower())
        if key not in seen:
            seen.add(key)
            unique.append(r)
    return unique


def main():
    """Scrape every known label and print the combined result as JSON.

    Exits with status 1 when neither the bookmarks nor the config yield any
    label.
    """
    # Bookmarks take precedence over config when both name the same label ID.
    labels = merge_labels(get_labels_from_bookmarks(), get_labels_from_config())

    if not labels:
        print("No Discogs labels found in bookmarks or config!", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(labels)} labels in bookmarks and config:", file=sys.stderr)
    for lid, lname in labels:
        print(f"  - {lname} ({lid})", file=sys.stderr)

    all_releases = []

    for label_id, label_name in labels:
        print(f"Scraping {label_name}...", file=sys.stderr)
        releases = get_label_releases(label_id, label_name)
        all_releases.extend(releases)
        print(f"  Found {len(releases)} releases", file=sys.stderr)
        time.sleep(2)  # Be nice to Discogs

    output = {
        "labels": [{"id": lid, "name": lname} for lid, lname in labels],
        "releases": dedupe_releases(all_releases),
        "scraped_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()