#!/usr/bin/env python3
"""Scrape releases from Discogs labels and emit them as JSON for the music pipeline.

Label IDs come from two sources: the "Discog Labels" bookmark folder (read via
the ``decrypt_bookmarks.js`` helper that lives next to this script) and the
``discogs_labels`` list in ``music_config.json``.  Releases are fetched from
the Discogs API, de-duplicated, and printed to stdout as a single JSON
document; all progress and error messages go to stderr.
"""

import json
import re
import subprocess
import sys
import time
from pathlib import Path
from typing import Optional
from urllib.parse import unquote

import requests

HEADERS = {
    "User-Agent": "MusicRecommender/1.0 +https://github.com/openclaw"
}

SCRIPTS_DIR = Path(__file__).parent

# Matches the element whose text is "Discog Labels" and captures everything
# up to the end of that folder.
# NOTE(review): the bookmark dump format is produced by decrypt_bookmarks.js,
# which is not visible here.  The pattern accepts both a Netscape-style
# export (<H3 ...>Discog Labels</H3><DL>...</DL>) and an XBEL-style one
# (<title>Discog Labels</title>...</folder>) -- confirm against real output.
# The capture stops at the first closing tag, so links that follow a nested
# sub-folder inside "Discog Labels" are not seen.
_FOLDER_RE = re.compile(
    r"<[^>]*>\s*Discog Labels\s*</[^>]*>(.*?)(?:</(?:DL|folder)\s*>|\Z)",
    re.DOTALL | re.IGNORECASE,
)

# Example link: https://www.discogs.com/label/6458-Indochina
_LABEL_LINK_RE = re.compile(
    r'href="https://www\.discogs\.com/label/(\d+)-([^"?]+)',
    re.IGNORECASE,
)

# Example config URL: https://www.discogs.com/label/6170-Tempa
_CONFIG_LABEL_ID_RE = re.compile(r"/label/(\d+)")

# Discogs appends " (2)", " (15)", ... to disambiguate identically named artists.
_ARTIST_SUFFIX_RE = re.compile(r"\s*\(\d+\)$")


def _read_bookmarks() -> Optional[str]:
    """Run the bookmark decrypt script and return its stdout, or None on failure."""
    try:
        result = subprocess.run(
            ["node", str(SCRIPTS_DIR / "decrypt_bookmarks.js")],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: node is missing or not executable; SubprocessError: timeout.
        print(f"Error reading bookmarks: {e}", file=sys.stderr)
        return None
    if result.returncode != 0:
        # Report the failure, but still hand back whatever was written to
        # stdout so a partially successful run is not thrown away.
        print(
            f"Error reading bookmarks: decrypt script exited with status "
            f"{result.returncode}: {result.stderr.strip()}",
            file=sys.stderr,
        )
    return result.stdout


def get_labels_from_bookmarks(
    bookmarks_xml: Optional[str] = None,
) -> list[tuple[int, str]]:
    """Extract Discogs label IDs and names from the "Discog Labels" bookmark folder.

    Args:
        bookmarks_xml: Bookmark markup to parse.  When omitted, the markup is
            obtained by running ``decrypt_bookmarks.js`` with ``node``.

    Returns:
        ``(label_id, label_name)`` tuples in document order.  The name is
        taken from the URL slug with hyphens turned into spaces and
        percent-escapes decoded.  An empty list is returned when the
        bookmarks cannot be read or the folder is not found (a message is
        printed to stderr in both cases).
    """
    if bookmarks_xml is None:
        bookmarks_xml = _read_bookmarks()
        if bookmarks_xml is None:
            return []

    folder_match = _FOLDER_RE.search(bookmarks_xml)
    if not folder_match:
        print("Could not find 'Discog Labels' folder in bookmarks", file=sys.stderr)
        return []

    # Only links inside the folder count; Discogs links elsewhere are ignored.
    return [
        (int(link.group(1)), unquote(link.group(2).replace("-", " ")))
        for link in _LABEL_LINK_RE.finditer(folder_match.group(1))
    ]


def _parse_release(raw: dict, label_name: str) -> Optional[dict]:
    """Convert one Discogs API release entry into a pipeline record, or None to skip it."""
    artist = raw.get("artist", "Various")
    title = raw.get("title", "")
    if not isinstance(artist, str) or not isinstance(title, str):
        # A null or otherwise non-string field cannot be cleaned or compared.
        return None

    artist = _ARTIST_SUFFIX_RE.sub("", artist)
    if not artist or not title or artist.lower() == "various":
        return None

    return {
        "artist": artist,
        "album": title,
        "year": raw.get("year", ""),
        "label": label_name,
        "discogs_id": raw.get("id"),
    }


def get_label_releases(
    label_id: int, label_name: str, max_pages: int = 5
) -> list[dict]:
    """Fetch releases for one label from the Discogs API.

    No filtering by release format is applied: every entry that has a title
    and a named artist other than "Various" is kept.

    Args:
        label_id: Numeric Discogs label ID.
        label_name: Human-readable name, copied into each record and used in
            progress messages.
        max_pages: Upper bound on the number of 100-item pages requested.

    Returns:
        A list of dicts with the keys ``artist``, ``album``, ``year``,
        ``label`` and ``discogs_id``.  If a request fails, the error is
        printed to stderr and the releases collected so far are returned.
    """
    releases = []

    for page in range(1, max_pages + 1):
        url = f"https://api.discogs.com/labels/{label_id}/releases?page={page}&per_page=100"
        print(f" Fetching {label_name} page {page}...", file=sys.stderr)

        try:
            resp = requests.get(url, headers=HEADERS, timeout=30)
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f" Error: {e}", file=sys.stderr)
            break

        for raw in data.get("releases", []):
            record = _parse_release(raw, label_name)
            if record is not None:
                releases.append(record)

        # Stop once the last page reported by the API has been read.
        if page >= data.get("pagination", {}).get("pages", 1):
            break

        time.sleep(1)  # Rate limit

    return releases


def get_labels_from_config() -> list[tuple[int, str]]:
    """Get Discogs labels from ``music_config.json`` next to this script.

    Each item of the ``discogs_labels`` list is expected to carry a ``url``
    such as ``https://www.discogs.com/label/6170-Tempa`` and a ``name``.
    Items without a parseable label ID or without a name are skipped.

    Returns:
        ``(label_id, label_name)`` tuples; an empty list when the file does
        not exist or cannot be read or parsed (the error goes to stderr).
    """
    labels = []
    config_path = SCRIPTS_DIR / "music_config.json"
    if not config_path.exists():
        return labels

    try:
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
        entries = config.get("discogs_labels", [])
    except (OSError, ValueError, AttributeError) as e:
        # ValueError: invalid JSON or encoding; AttributeError: top level is not an object.
        print(f"Error reading config: {e}", file=sys.stderr)
        return labels

    if not isinstance(entries, list):
        print("Error reading config: 'discogs_labels' is not a list", file=sys.stderr)
        return labels

    for entry in entries:
        if not isinstance(entry, dict):
            continue  # A malformed entry must not discard the valid ones after it.
        url = entry.get("url", "")
        name = entry.get("name", "")
        match = _CONFIG_LABEL_ID_RE.search(url) if isinstance(url, str) else None
        if match and name:
            labels.append((int(match.group(1)), name))

    return labels


def _merge_labels(*sources: list[tuple[int, str]]) -> list[tuple[int, str]]:
    """Concatenate label lists, keeping only the first occurrence of each label ID."""
    merged = []
    seen_ids = set()
    for source in sources:
        for label_id, label_name in source:
            if label_id not in seen_ids:
                seen_ids.add(label_id)
                merged.append((label_id, label_name))
    return merged


def _dedupe_releases(releases: list[dict]) -> list[dict]:
    """Drop releases whose artist and album repeat an earlier one, ignoring case."""
    seen = set()
    unique = []
    for release in releases:
        # A tuple key cannot collide the way a joined string can when a name
        # happens to contain the separator character.
        key = (release["artist"].lower(), release["album"].lower())
        if key not in seen:
            seen.add(key)
            unique.append(release)
    return unique


def main():
    """Collect labels, scrape their releases and print the result as JSON on stdout.

    Exits with status 1 when neither the bookmarks nor the config yield a label.
    """
    # Bookmark labels take precedence: for an ID present in both sources the
    # bookmark-derived name is the one that is kept.
    labels = _merge_labels(get_labels_from_bookmarks(), get_labels_from_config())

    if not labels:
        print("No Discogs labels found in bookmarks or config!", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(labels)} labels in bookmarks and config:", file=sys.stderr)
    for lid, lname in labels:
        print(f" - {lname} ({lid})", file=sys.stderr)

    all_releases = []
    for label_id, label_name in labels:
        print(f"Scraping {label_name}...", file=sys.stderr)
        releases = get_label_releases(label_id, label_name)
        all_releases.extend(releases)
        print(f" Found {len(releases)} releases", file=sys.stderr)
        time.sleep(2)  # Be nice to Discogs

    output = {
        "labels": [{"id": lid, "name": lname} for lid, lname in labels],
        "releases": _dedupe_releases(all_releases),
        # Local wall-clock time; no timezone information is recorded.
        "scraped_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()