summaryrefslogtreecommitdiff
path: root/scrape_discogs_labels.py
diff options
context:
space:
mode:
Diffstat (limited to 'scrape_discogs_labels.py')
-rw-r--r--scrape_discogs_labels.py183
1 files changed, 183 insertions, 0 deletions
diff --git a/scrape_discogs_labels.py b/scrape_discogs_labels.py
new file mode 100644
index 0000000..7f6c6ff
--- /dev/null
+++ b/scrape_discogs_labels.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""Scrape releases from Discogs labels and save to JSON for the music pipeline.
+
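+Reads label URLs from Tom's bookmarks ("Discog Labels" folder) and from
+music_config.json, fetches each label's releases from the Discogs API, and
+prints the combined result as JSON on stdout.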
+"""
+
+import json
+import re
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+import requests
+
# HTTP headers sent with every Discogs API request; the User-Agent string
# identifies this client to the server.
_USER_AGENT = "MusicRecommender/1.0 +https://github.com/openclaw"
HEADERS = {"User-Agent": _USER_AGENT}

# Directory containing this script. The helper script decrypt_bookmarks.js and
# the music_config.json file are looked up relative to it.
_THIS_FILE = Path(__file__)
SCRIPTS_DIR = _THIS_FILE.parent
+
+
def get_labels_from_bookmarks(bookmarks_xml: "str | None" = None) -> list[tuple[int, str]]:
    """Extract Discogs label IDs and names from the "Discog Labels" bookmark folder.

    Only links inside the first folder whose title is "Discog Labels"
    (case-insensitive) are considered.  The folder body is taken up to the
    first closing ``</folder>`` tag, so links that follow a nested sub-folder
    are not seen.

    Args:
        bookmarks_xml: Already-decrypted bookmarks XML.  When omitted, the
            ``decrypt_bookmarks.js`` script next to this file is run with
            ``node`` and its stdout is parsed instead.

    Returns:
        ``(label_id, label_name)`` tuples in bookmark order, one per distinct
        label ID.  The name is derived from the URL slug (hyphens become
        spaces, percent-escapes are decoded).  An empty list is returned, and
        a message printed to stderr, when the bookmarks cannot be read or the
        folder is missing.
    """
    # Local import: the file's top-level import block does not provide it.
    from urllib.parse import unquote

    if bookmarks_xml is None:
        try:
            result = subprocess.run(
                ["node", str(SCRIPTS_DIR / "decrypt_bookmarks.js")],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except (OSError, subprocess.SubprocessError, UnicodeDecodeError) as e:
            print(f"Error reading bookmarks: {e}", file=sys.stderr)
            return []

        # A failing decrypt script must not be mistaken for "folder not found".
        if result.returncode != 0:
            detail = (result.stderr or "").strip()
            print(
                f"Error reading bookmarks: decrypt_bookmarks.js exited with "
                f"status {result.returncode}: {detail}",
                file=sys.stderr,
            )
            return []

        bookmarks_xml = result.stdout

    # Look for <folder...><title>Discog Labels</title>...</folder>
    folder_match = re.search(
        r'<folder[^>]*>\s*<title>Discog Labels</title>(.*?)</folder>',
        bookmarks_xml,
        re.DOTALL | re.IGNORECASE,
    )
    if not folder_match:
        print("Could not find 'Discog Labels' folder in bookmarks", file=sys.stderr)
        return []

    # Example link: https://www.discogs.com/label/6458-Indochina
    # The slug stops at a query string, fragment or further path segment.
    link_pattern = r'href="https://www\.discogs\.com/label/(\d+)-([^"?#/]+)'

    labels = []
    seen_ids = set()
    for match in re.finditer(link_pattern, folder_match.group(1)):
        label_id = int(match.group(1))
        if label_id in seen_ids:
            # Same label bookmarked twice; scraping it again would be wasted work.
            continue
        seen_ids.add(label_id)
        label_name = unquote(match.group(2).replace("-", " "))
        labels.append((label_id, label_name))

    return labels
+
def get_label_releases(label_id: int, label_name: str, max_pages: int = 5) -> list[dict]:
    """Fetch releases for one Discogs label from the Discogs API.

    Requests ``/labels/{label_id}/releases`` page by page (100 items per page)
    until the last page reported by the API or ``max_pages`` is reached,
    pausing one second between requests.  No filtering by release format is
    applied.

    Args:
        label_id: Numeric Discogs label ID.
        label_name: Label name, used for progress messages and stored in each
            returned record.
        max_pages: Maximum number of pages to request; values below 1 make
            no request at all.

    Returns:
        A list of dicts with keys ``artist``, ``album``, ``year``, ``label``
        and ``discogs_id``.  Releases with no artist, no title, or the artist
        "Various" are left out.  On a request or JSON error the releases
        gathered so far are returned.
    """
    releases = []
    # Trailing "(2)"-style suffix that Discogs appends to disambiguate artists.
    disambiguation = re.compile(r'\s*\(\d+\)$')

    for page in range(1, max_pages + 1):
        url = f"https://api.discogs.com/labels/{label_id}/releases?page={page}&per_page=100"
        print(f" Fetching {label_name} page {page}...", file=sys.stderr)

        try:
            resp = requests.get(url, headers=HEADERS, timeout=30)
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print(f" Error: {e}", file=sys.stderr)
            break

        if not isinstance(data, dict):
            print(" Error: unexpected response shape", file=sys.stderr)
            break

        for r in data.get("releases") or []:
            if not isinstance(r, dict):
                continue

            # A missing or null artist is treated like "Various" and skipped.
            artist = disambiguation.sub("", str(r.get("artist") or "Various"))
            title = r.get("title") or ""
            year = r.get("year", "")

            if artist and title and artist.lower() != "various":
                releases.append({
                    "artist": artist,
                    "album": title,
                    "year": year,
                    "label": label_name,
                    "discogs_id": r.get("id"),
                })

        # Stop once the API reports no further pages.
        pagination = data.get("pagination") or {}
        if page >= pagination.get("pages", 1):
            break

        # Rate limit -- only needed when another request will follow.
        if page < max_pages:
            time.sleep(1)

    return releases
+
def get_labels_from_config(config_path: "Path | str | None" = None) -> list[tuple[int, str]]:
    """Read Discogs labels from the ``discogs_labels`` list of a JSON config.

    Each usable entry is an object with a ``url`` such as
    ``https://www.discogs.com/label/6170-Tempa`` and a non-empty ``name``.
    Entries that are malformed, lack a name, or whose URL has no
    ``/label/<digits>`` part are skipped without affecting the others.

    Args:
        config_path: Location of the config file.  Defaults to
            ``music_config.json`` next to this script.

    Returns:
        ``(label_id, name)`` tuples in file order.  An empty list when the
        file does not exist, cannot be read, or is not valid JSON (the latter
        two also print a message to stderr).
    """
    if config_path is None:
        config_path = SCRIPTS_DIR / "music_config.json"
    config_path = Path(config_path)

    if not config_path.exists():
        return []

    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading config: {e}", file=sys.stderr)
        return []

    if not isinstance(config, dict):
        print("Error reading config: top-level JSON value is not an object", file=sys.stderr)
        return []

    entries = config.get("discogs_labels") or []
    if not isinstance(entries, list):
        print("Error reading config: 'discogs_labels' is not a list", file=sys.stderr)
        return []

    labels = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        url = entry.get("url") or ""
        name = entry.get("name") or ""
        if not isinstance(url, str):
            continue
        # Extract ID from URL like https://www.discogs.com/label/6170-Tempa
        match = re.search(r'/label/(\d+)', url)
        if match and name:
            labels.append((int(match.group(1)), name))

    return labels
+
+
def main():
    """Collect labels, fetch their releases and print the result as JSON.

    Labels come from the bookmarks first and then from the config file; a
    config label is added only when its ID was not already found.  Releases
    from all labels are de-duplicated case-insensitively on (artist, album),
    keeping the first occurrence.  Progress goes to stderr; the JSON document
    (``labels``, ``releases``, ``scraped_at``) goes to stdout.

    Exits with status 1 when no labels are found at all.
    """
    labels = get_labels_from_bookmarks()

    # Merge, avoiding duplicates by ID
    seen_ids = {lid for lid, _ in labels}
    for lid, lname in get_labels_from_config():
        if lid not in seen_ids:
            seen_ids.add(lid)
            labels.append((lid, lname))

    if not labels:
        print("No Discogs labels found in bookmarks or config!", file=sys.stderr)
        sys.exit(1)

    # The list holds labels from both sources, so the message says so.
    print(f"Found {len(labels)} labels in bookmarks and config:", file=sys.stderr)
    for lid, lname in labels:
        print(f" - {lname} ({lid})", file=sys.stderr)

    all_releases = []
    for index, (label_id, label_name) in enumerate(labels):
        if index:
            # Be nice to Discogs: pause between labels, not after the last one.
            time.sleep(2)
        print(f"Scraping {label_name}...", file=sys.stderr)
        releases = get_label_releases(label_id, label_name)
        all_releases.extend(releases)
        print(f" Found {len(releases)} releases", file=sys.stderr)

    # Dedupe by artist+album.  A tuple key cannot collide the way a
    # "|"-joined string can when a name itself contains "|".
    unique = {}
    for r in all_releases:
        key = (r["artist"].lower(), r["album"].lower())
        unique.setdefault(key, r)

    output = {
        "labels": [{"id": lid, "name": lname} for lid, lname in labels],
        "releases": list(unique.values()),
        "scraped_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    }

    print(json.dumps(output, indent=2))


if __name__ == "__main__":
    main()