-rw-r--r--   .gitignore                      14
-rw-r--r--   README.md                       23
-rw-r--r--   add_bookmark.js                 76
-rwxr-xr-x   add_bookmark_to_wishlist.js     97
-rw-r--r--   decrypt_bookmarks.js            34
-rw-r--r--   generate_stats_page.py         285
-rwxr-xr-x   import_music.sh                131
-rw-r--r--   morning_report.py              438
-rw-r--r--   music_config.example.json       15
-rwxr-xr-x   music_recommender.py          1038
-rwxr-xr-x   overnight_transcoder.py       1245
-rw-r--r--   scrape_discogs_labels.py       183
-rwxr-xr-x   transcode_album.sh              81
13 files changed, 3660 insertions, 0 deletions
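Several of these scripts are meant to be cron-driven: the transcoder runs nightly, morning_report.py documents a 06:45 run "after transcoder finishes", generate_stats_page.py is regenerated periodically, and music_recommender.py's surprise mode is an overnight job with a --wait flag. No crontab ships with this commit; the sketch below is only an illustration of that schedule — the 06:45 slot is the single time stated in the scripts, and the other times and the /opt/susan-scripts checkout path are assumptions.

# Hypothetical crontab sketch — only the 06:45 morning-report time comes from the scripts themselves.
# Assumes the repo is checked out at /opt/susan-scripts and the scripts are executable.
0 1 * * *     /opt/susan-scripts/overnight_transcoder.py                # nightly HEVC pass
15 1 * * *    /opt/susan-scripts/music_recommender.py surprise --wait   # overnight surprise album download
45 6 * * *    /opt/susan-scripts/morning_report.py                      # daily report (06:45, per docstring)
*/30 * * * *  /opt/susan-scripts/generate_stats_page.py                 # refresh retro stats page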
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..00150fb --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +# Secrets & config with credentials +music_config.json +music_state.json +bookmarks.json + +# Generated data +discogs_labels.json +__pycache__/ +*.pyc + +# Editor +*.swp +*.swo +*~ diff --git a/README.md b/README.md new file mode 100644 index 0000000..b1a2f9b --- /dev/null +++ b/README.md @@ -0,0 +1,23 @@ +# susan-scripts + +Automation scripts for Susan (home server). Managed by Caine. + +## Scripts + +| Script | Purpose | +|--------|---------| +| `overnight_transcoder.py` | Nightly HEVC transcoding of video library | +| `morning_report.py` | Daily system health report (email) | +| `music_recommender.py` | Last.fm-based music discovery + Soulseek download | +| `import_music.sh` | FLAC→Opus transcoding + beets tagging for new albums | +| `transcode_album.sh` | Manual album transcode helper | +| `scrape_discogs_labels.py` | Scrape Discogs labels for music pipeline | +| `generate_stats_page.py` | Retro stats page generator | +| `decrypt_bookmarks.js` | Floccus bookmark decryption | +| `add_bookmark.js` | Add bookmark to Floccus XBEL | +| `add_bookmark_to_wishlist.js` | Add bookmark to wishlist folder | + +## Setup + +- Copy `music_config.example.json` to `music_config.json` and fill in credentials. +- Bookmark scripts read password from `/etc/automation/bookmarks.json`. diff --git a/add_bookmark.js b/add_bookmark.js new file mode 100644 index 0000000..9a68789 --- /dev/null +++ b/add_bookmark.js @@ -0,0 +1,76 @@ +#!/usr/bin/env node +// Add a bookmark to Floccus encrypted bookmarks +// Usage: node add_bookmark.js <url> <title> + +const crypto = require('crypto'); +const fs = require('fs'); + +const BOOKMARKS_PATH = '/var/www/webdav/bookmarks.xbel'; +const PASSWORD = process.env.BOOKMARKS_PASSWORD || JSON.parse(require('fs').readFileSync('/etc/automation/bookmarks.json', 'utf8')).password; + +const url = process.argv[2]; +const title = process.argv[3]; + +if (!url || !title) { + console.log('Usage: node add_bookmark.js <url> <title>'); + process.exit(1); +} + +function decrypt(data, password) { + const ciphertext = Buffer.from(data.ciphertext, 'base64'); + const salt = data.salt; + const key = crypto.pbkdf2Sync(password, salt, 250000, 32, 'sha256'); + const iv = ciphertext.slice(0, 16); + const encrypted = ciphertext.slice(16, -16); + const tag = ciphertext.slice(-16); + + const decipher = crypto.createDecipheriv('aes-256-gcm', key, iv); + decipher.setAuthTag(tag); + return Buffer.concat([decipher.update(encrypted), decipher.final()]).toString('utf8'); +} + +function encrypt(plaintext, password, salt) { + const key = crypto.pbkdf2Sync(password, salt, 250000, 32, 'sha256'); + const iv = crypto.randomBytes(16); + + const cipher = crypto.createCipheriv('aes-256-gcm', key, iv); + const encrypted = Buffer.concat([cipher.update(plaintext, 'utf8'), cipher.final()]); + const tag = cipher.getAuthTag(); + + const combined = Buffer.concat([iv, encrypted, tag]); + return combined.toString('base64'); +} + +// Read and decrypt +const data = JSON.parse(fs.readFileSync(BOOKMARKS_PATH, 'utf8')); +let xml = decrypt(data, PASSWORD); + +// Find highest ID +const idMatch = xml.match(/highestId :(\d+):/); +let highestId = idMatch ? 
parseInt(idMatch[1]) : 100;
+const newId = highestId + 1;
+
+// Escape XML entities
+const escapeXml = (str) => str
+  .replace(/&/g, '&amp;')
+  .replace(/</g, '&lt;')
+  .replace(/>/g, '&gt;')
+  .replace(/"/g, '&quot;')
+  .replace(/'/g, '&apos;');
+
+// Add bookmark
+const newBookmark = `<bookmark href="${escapeXml(url)}" id="${newId}">
+    <title>${escapeXml(title)}</title>
+  </bookmark>
+</xbel>`;
+
+xml = xml.replace(/highestId :(\d+):/, `highestId :${newId}:`);
+xml = xml.replace('</xbel>', newBookmark);
+
+// Encrypt and save
+const newCiphertext = encrypt(xml, PASSWORD, data.salt);
+fs.writeFileSync(BOOKMARKS_PATH, JSON.stringify({ ciphertext: newCiphertext, salt: data.salt }));
+
+console.log(`Added bookmark: ${title}`);
+console.log(`URL: ${url}`);
+console.log(`ID: ${newId}`);
diff --git a/add_bookmark_to_wishlist.js b/add_bookmark_to_wishlist.js
new file mode 100755
index 0000000..a488174
--- /dev/null
+++ b/add_bookmark_to_wishlist.js
@@ -0,0 +1,97 @@
+#!/usr/bin/env node
+// Add a bookmark to Music/Wishlist folder in Floccus bookmarks
+// Usage: node add_bookmark_to_wishlist.js <url> <title>
+
+const crypto = require('crypto');
+const fs = require('fs');
+
+const BOOKMARKS_PATH = '/var/www/webdav/bookmarks.xbel';
+const PASSWORD = process.env.BOOKMARKS_PASSWORD || JSON.parse(require('fs').readFileSync('/etc/automation/bookmarks.json', 'utf8')).password;
+const WISHLIST_FOLDER_TITLE = 'Wishlist';
+
+const url = process.argv[2];
+const title = process.argv[3];
+
+if (!url || !title) {
+  console.error('Usage: node add_bookmark_to_wishlist.js <url> <title>');
+  process.exit(1);
+}
+
+function decrypt(data, password) {
+  const ciphertext = Buffer.from(data.ciphertext, 'base64');
+  const salt = data.salt;
+  const key = crypto.pbkdf2Sync(password, salt, 250000, 32, 'sha256');
+  const iv = ciphertext.slice(0, 16);
+  const encrypted = ciphertext.slice(16, -16);
+  const tag = ciphertext.slice(-16);
+
+  const decipher = crypto.createDecipheriv('aes-256-gcm', key, iv);
+  decipher.setAuthTag(tag);
+  return Buffer.concat([decipher.update(encrypted), decipher.final()]).toString('utf8');
+}
+
+function encrypt(plaintext, password, salt) {
+  const key = crypto.pbkdf2Sync(password, salt, 250000, 32, 'sha256');
+  const iv = crypto.randomBytes(16);
+
+  const cipher = crypto.createCipheriv('aes-256-gcm', key, iv);
+  const encrypted = Buffer.concat([cipher.update(plaintext, 'utf8'), cipher.final()]);
+  const tag = cipher.getAuthTag();
+
+  return Buffer.concat([iv, encrypted, tag]).toString('base64');
+}
+
+const escapeXml = (str) => str
+  .replace(/&/g, '&amp;')
+  .replace(/</g, '&lt;')
+  .replace(/>/g, '&gt;')
+  .replace(/"/g, '&quot;')
+  .replace(/'/g, '&apos;');
+
+try {
+  const data = JSON.parse(fs.readFileSync(BOOKMARKS_PATH, 'utf8'));
+  let xml = decrypt(data, PASSWORD);
+
+  // Check if bookmark already exists
+  if (xml.includes(escapeXml(url)) || xml.includes(url)) {
+    console.log('Bookmark already exists');
+    process.exit(0);
+  }
+
+  // Find highest ID
+  const idMatch = xml.match(/highestId :(\d+):/);
+  let highestId = idMatch ?
parseInt(idMatch[1]) : 100; + const newId = highestId + 1; + + // Find the Wishlist folder and add bookmark inside it + const wishlistPattern = /<folder id="\d+">\s*<title>Wishlist<\/title>/; + + if (wishlistPattern.test(xml)) { + // Add bookmark inside existing Wishlist folder + const newBookmark = `<bookmark href="${escapeXml(url)}" id="${newId}"> + <title>${escapeXml(title)}</title> + </bookmark> + </folder>`; + + // Find Wishlist folder's closing tag and insert before it + xml = xml.replace( + /(<folder id="\d+">\s*<title>Wishlist<\/title>[\s\S]*?)(<\/folder>)/, + (match, folderContent, closingTag) => folderContent + newBookmark + ); + } else { + console.error('Wishlist folder not found'); + process.exit(1); + } + + // Update highest ID + xml = xml.replace(/highestId :(\d+):/, `highestId :${newId}:`); + + // Save + const newCiphertext = encrypt(xml, PASSWORD, data.salt); + fs.writeFileSync(BOOKMARKS_PATH, JSON.stringify({ ciphertext: newCiphertext, salt: data.salt })); + + console.log(`Added: ${title}`); +} catch (e) { + console.error('Error:', e.message); + process.exit(1); +} diff --git a/decrypt_bookmarks.js b/decrypt_bookmarks.js new file mode 100644 index 0000000..e73c4e4 --- /dev/null +++ b/decrypt_bookmarks.js @@ -0,0 +1,34 @@ +#!/usr/bin/env node +// Decrypt Floccus bookmarks +// Usage: node decrypt_bookmarks.js [password] + +const crypto = require('crypto'); +const fs = require('fs'); + +const BOOKMARKS_PATH = '/var/www/webdav/bookmarks.xbel'; +const password = process.argv[2] || process.env.BOOKMARKS_PASSWORD || JSON.parse(require('fs').readFileSync('/etc/automation/bookmarks.json', 'utf8')).password; + +const data = JSON.parse(fs.readFileSync(BOOKMARKS_PATH, 'utf8')); +const ciphertext = Buffer.from(data.ciphertext, 'base64'); +const salt = data.salt; // Floccus uses salt as UTF-8 string, not hex-decoded + +// Floccus encryption: PBKDF2-SHA256, 250000 iterations +const key = crypto.pbkdf2Sync(password, salt, 250000, 32, 'sha256'); + +// 16-byte IV at start, then ciphertext, then 16-byte GCM tag at end +const iv = ciphertext.slice(0, 16); +const encrypted = ciphertext.slice(16, -16); +const tag = ciphertext.slice(-16); + +try { + const decipher = crypto.createDecipheriv('aes-256-gcm', key, iv); + decipher.setAuthTag(tag); + + let decrypted = decipher.update(encrypted); + decrypted = Buffer.concat([decrypted, decipher.final()]); + + console.log(decrypted.toString('utf8')); +} catch (e) { + console.error('Decryption failed:', e.message); + process.exit(1); +} diff --git a/generate_stats_page.py b/generate_stats_page.py new file mode 100644 index 0000000..efbd423 --- /dev/null +++ b/generate_stats_page.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 +"""Generate a retro stats page for Susan. + +90s CS professor homepage aesthetic. Pure HTML, no JS. +Regenerate periodically via cron. 
+""" + +import datetime +import os +import subprocess +import re + +OUTPUT = "/var/www/webdav/obsidian/stats.html" + +def cmd(c): + try: + return subprocess.check_output(c, shell=True, stderr=subprocess.DEVNULL, timeout=10).decode().strip() + except: + return "" + +def get_uptime(): + up = cmd("uptime -p") + return up.replace("up ", "") if up else "unknown" + +def get_uptime_since(): + return cmd("uptime -s") + +def get_load(): + load = cmd("cat /proc/loadavg") + return load.split()[:3] if load else ["?", "?", "?"] + +def get_memory(): + mem = cmd("free -h | grep Mem") + parts = mem.split() + if len(parts) >= 7: + return {"total": parts[1], "used": parts[2], "free": parts[3], "available": parts[6]} + return {"total": "?", "used": "?", "free": "?", "available": "?"} + +def get_disk(): + disks = [] + for line in cmd("df -h /disks /home 2>/dev/null").split("\n")[1:]: + parts = line.split() + if len(parts) >= 6: + disks.append({"fs": parts[0], "size": parts[1], "used": parts[2], "avail": parts[3], "pct": parts[4], "mount": parts[5]}) + return disks + +def get_services(): + services = [] + lines = cmd("systemctl list-units --type=service --state=running --no-pager --no-legend").split("\n") + targets = ["jellyfin", "navidrome", "qbittorrent", "sonarr", "radarr", "lidarr", + "readarr", "prowlarr", "slskd", "nginx", "audiobookshelf", "openclaw-gateway"] + for line in lines: + for t in targets: + if t in line.lower(): + name = line.split()[0].replace(".service", "") + services.append(name) + return sorted(services) + +def get_media_counts(): + films = int(cmd("find /disks/Plex/Films -maxdepth 1 -type d | wc -l") or 1) - 1 + tv = int(cmd("find /disks/Plex/TV -maxdepth 1 -type d | wc -l") or 1) - 1 + anime = int(cmd("find /disks/Plex/Anime -maxdepth 1 -type d | wc -l") or 1) - 1 + tracks = int(cmd("find /disks/Plex/Music -type f \\( -name '*.flac' -o -name '*.mp3' -o -name '*.ogg' -o -name '*.opus' \\) | wc -l") or 0) + artists = int(cmd("ls -1d /disks/Plex/Music/*/ 2>/dev/null | grep -v -E 'venv|lib|bin|data|include|_|pyvenv' | wc -l") or 0) + return {"films": films, "tv": tv, "anime": anime, "tracks": tracks, "artists": artists} + +def get_cpu(): + model = cmd("grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2").strip() + cores = cmd("nproc") + return model, cores + +def get_packages(): + return cmd("dpkg -l | grep '^ii' | wc -l") + +def get_kernel(): + return cmd("uname -r") + +def get_os(): + return cmd("grep PRETTY_NAME /etc/os-release | cut -d'\"' -f2") + +def get_install_date(): + d = cmd("stat -c %w /") + if d and d != "-": + return d.split(".")[0] + return "unknown" + +COUNTER_FILE = os.path.join(os.path.dirname(__file__), "..", "data", "visitor_counter.txt") + +def get_and_increment_counter(): + """Read and increment a persistent visitor counter.""" + os.makedirs(os.path.dirname(COUNTER_FILE), exist_ok=True) + count = 0 + if os.path.exists(COUNTER_FILE): + try: + count = int(open(COUNTER_FILE).read().strip()) + except: + count = 0 + count += 1 + with open(COUNTER_FILE, "w") as f: + f.write(str(count)) + return count + +now = datetime.datetime.now() +uptime = get_uptime() +uptime_since = get_uptime_since() +load = get_load() +mem = get_memory() +disks = get_disk() +services = get_services() +media = get_media_counts() +cpu_model, cpu_cores = get_cpu() +packages = get_packages() +kernel = get_kernel() +os_name = get_os() +install_date = get_install_date() +visitor_count = get_and_increment_counter() + +html = f"""<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> 
+<head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> +<meta http-equiv="refresh" content="300"> +<title>Susan - System Status</title> +<style type="text/css"> +body {{ + background-color: #e8e8e8; + background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAMklEQVQYV2N89+7dfwYGBgZGRkYGBgYmBjIBE7mKR5UOPIUMDIyMjGQrHlU68BQCABPnC/0ZkzYUAAAAAElFTkSuQmCC"); + font-family: "Times New Roman", Times, serif; + color: #333; + margin: 20px 40px; + font-size: 14pt; +}} +h1 {{ + font-size: 22pt; + color: #003366; + border-bottom: 2px solid #003366; + padding-bottom: 4px; +}} +h2 {{ + font-size: 16pt; + color: #003366; + margin-top: 24px; +}} +table {{ + border-collapse: collapse; + margin: 8px 0; +}} +td, th {{ + border: 2px solid #003366; + padding: 4px 10px; + text-align: left; + background-color: #f5f5f0; +}} +th {{ + background-color: #d0d0c8; + font-weight: bold; + color: #003366; +}} +a {{ + color: #003366; +}} +hr {{ + border: none; + border-top: 2px solid #003366; + margin: 16px 0; +}} +.footer {{ + margin-top: 30px; + font-size: 10pt; + color: #666; + border-top: 1px solid #999; + padding-top: 6px; +}} +.status-ok {{ + color: green; + font-weight: bold; +}} +.status-warn {{ + color: #cc7700; + font-weight: bold; +}} +.counter {{ + font-family: "Courier New", monospace; + background-color: #000; + color: #0f0; + padding: 2px 6px; + font-size: 10pt; +}} +</style> +</head> +<body> + +<h1>Susan — System Status</h1> + +<hr> + +<h2>General Information</h2> + +<table> +<tr><th>Hostname</th><td>susan</td></tr> +<tr><th>Operating System</th><td>{os_name}</td></tr> +<tr><th>Kernel</th><td>{kernel}</td></tr> +<tr><th>Processor</th><td>{cpu_model} ({cpu_cores} cores)</td></tr> +<tr><th>Installed Packages</th><td>{packages}</td></tr> +<tr><th>First Installed</th><td>{install_date}</td></tr> +</table> + +<h2>Uptime & Load</h2> + +<table> +<tr><th>Uptime</th><td>{uptime}</td></tr> +<tr><th>Up Since</th><td>{uptime_since}</td></tr> +<tr><th>Load Average</th><td>{load[0]}, {load[1]}, {load[2]} (1, 5, 15 min)</td></tr> +</table> + +<h2>Memory</h2> + +<table> +<tr><th>Total</th><th>Used</th><th>Free</th><th>Available</th></tr> +<tr><td>{mem['total']}</td><td>{mem['used']}</td><td>{mem['free']}</td><td>{mem['available']}</td></tr> +</table> + +<h2>Disk Usage</h2> + +<table> +<tr><th>Mount</th><th>Size</th><th>Used</th><th>Available</th><th>Use%</th></tr> +""" + +for d in disks: + pct = int(d["pct"].replace("%", "")) if d["pct"].replace("%", "").isdigit() else 0 + cls = "status-warn" if pct > 85 else "status-ok" + html += f'<tr><td>{d["mount"]}</td><td>{d["size"]}</td><td>{d["used"]}</td><td>{d["avail"]}</td><td class="{cls}">{d["pct"]}</td></tr>\n' + +html += f"""</table> + +<h2>Running Services</h2> + +<table> +<tr><th>Service</th><th>Status</th></tr> +""" + +for s in services: + html += f'<tr><td>{s}</td><td class="status-ok">running</td></tr>\n' + +html += f"""</table> + +<h2>Media Library</h2> + +<table> +<tr><th>Category</th><th>Count</th></tr> +<tr><td>Films</td><td>{media['films']}</td></tr> +<tr><td>TV Shows</td><td>{media['tv']}</td></tr> +<tr><td>Anime</td><td>{media['anime']}</td></tr> +<tr><td>Music Artists</td><td>{media['artists']}</td></tr> +<tr><td>Music Tracks</td><td>{media['tracks']}</td></tr> +</table> + +<hr> + +<div class="footer"> +<p> +You are visitor number <span class="counter">{visitor_count:06,}</span> since December 2023. +</p> +<p> +This page was last generated on <b>{now.strftime("%A, %d %B %Y at %H:%M:%S")}</b>. 
+<br> +No JavaScript was harmed in the making of this page. +<br> +Best viewed with any browser. Optimised for 800×600. +</p> +</div> + +</body> +</html> +""" + +os.makedirs(os.path.dirname(OUTPUT), exist_ok=True) +with open(OUTPUT, "w") as f: + f.write(html) + +print(f"Generated stats page: {OUTPUT}") +print(f" Uptime: {uptime}") +print(f" Services: {len(services)}") +print(f" Films: {media['films']}, TV: {media['tv']}, Anime: {media['anime']}") +print(f" Music: {media['tracks']} tracks, {media['artists']} artists") diff --git a/import_music.sh b/import_music.sh new file mode 100755 index 0000000..bf6c1aa --- /dev/null +++ b/import_music.sh @@ -0,0 +1,131 @@ +#!/bin/bash +# Import, transcode, and organize music from slskd downloads +# Usage: ./import_music.sh [--dry-run] [--keep-flac] + +INGEST_DIR="/disks/Plex/Music/_ingest" +LIBRARY_DIR="/disks/Plex/Music" +GROUP="mediaserver" +BITRATE="128k" + +DRY_RUN=false +KEEP_FLAC=false + +for arg in "$@"; do + case $arg in + --dry-run) DRY_RUN=true ;; + --keep-flac) KEEP_FLAC=true ;; + esac +done + +echo "=== Music Import Started ===" +$DRY_RUN && echo "DRY RUN MODE" + +# Skip these directories +shopt -s extglob + +cd "$INGEST_DIR" || exit 1 + +for album_dir in */; do + [[ ! -d "$album_dir" ]] && continue + dir_name="${album_dir%/}" + + # Skip special dirs + [[ "$dir_name" == "incomplete" ]] && continue + [[ "$dir_name" == "downloads" ]] && continue + + # Count audio files + audio_count=$(ls -1 "$album_dir"*.flac "$album_dir"*.mp3 "$album_dir"*.opus 2>/dev/null | wc -l) + [[ $audio_count -lt 2 ]] && continue + + echo "" + echo "Processing: $dir_name ($audio_count files)" + + # Get first audio file for metadata + first_file=$(ls "$album_dir"*.flac "$album_dir"*.mp3 2>/dev/null | head -1) + [[ -z "$first_file" ]] && continue + + # Extract metadata + artist=$(ffprobe -v quiet -show_entries format_tags=artist -of default=noprint_wrappers=1:nokey=1 "$first_file" 2>/dev/null) + album_artist=$(ffprobe -v quiet -show_entries format_tags=album_artist -of default=noprint_wrappers=1:nokey=1 "$first_file" 2>/dev/null) + album=$(ffprobe -v quiet -show_entries format_tags=album -of default=noprint_wrappers=1:nokey=1 "$first_file" 2>/dev/null) + + # Prefer album_artist over artist + [[ -n "$album_artist" ]] && artist="$album_artist" + + # Clean for filesystem + artist=$(echo "$artist" | tr -d '<>:"/\\|?*' | sed 's/\.$//') + album=$(echo "$album" | tr -d '<>:"/\\|?*' | sed 's/\.$//') + + if [[ -z "$artist" ]] || [[ -z "$album" ]]; then + echo " ⚠ Missing metadata, skipping" + continue + fi + + dest_dir="$LIBRARY_DIR/$artist/$album" + echo " → $artist / $album" + + if $DRY_RUN; then + echo " [DRY RUN] Would transcode to $dest_dir" + continue + fi + + mkdir -p "$dest_dir" + + # Transcode FLACs + for flac in "$album_dir"*.flac "$album_dir"*.FLAC; do + [[ ! 
-f "$flac" ]] && continue + + base=$(basename "$flac") + # Remove .flac extension properly + name="${base%.[Ff][Ll][Aa][Cc]}" + opus_out="$dest_dir/${name}.opus" + + echo " Transcoding: $name" + ffmpeg -hide_banner -loglevel error -i "$flac" \ + -c:a libopus -b:a "$BITRATE" -vbr on \ + -map_metadata 0 -y "$opus_out" + + [[ -f "$opus_out" ]] && chgrp "$GROUP" "$opus_out" 2>/dev/null + done + + # Remove any FLACs from destination (we only want Opus there) + for flac in "$dest_dir"/*.flac "$dest_dir"/*.FLAC; do + [[ -f "$flac" ]] && rm "$flac" + done + + # Copy non-FLAC audio + for audio in "$album_dir"*.mp3 "$album_dir"*.m4a "$album_dir"*.ogg; do + [[ -f "$audio" ]] && cp "$audio" "$dest_dir/" + done + + # Copy artwork + for art in "$album_dir"*.jpg "$album_dir"*.png "$album_dir"cover.* "$album_dir"folder.*; do + [[ -f "$art" ]] && cp "$art" "$dest_dir/" + done + + # Fix permissions + chgrp -R "$GROUP" "$dest_dir" 2>/dev/null + chmod -R g+rw "$dest_dir" 2>/dev/null + + # Run beets for tagging + echo " Running beets..." + beet import -q "$dest_dir" 2>/dev/null + if [[ $? -eq 0 ]]; then + echo " ✓ Beets tagging complete" + else + echo " ⚠ Beets tagging skipped or failed (check manually)" + fi + + # Verify + new_count=$(ls -1 "$dest_dir"/*.opus "$dest_dir"/*.mp3 2>/dev/null | wc -l) + if [[ $new_count -ge 3 ]]; then + echo " ✓ Imported $new_count files" + rm -rf "$INGEST_DIR/$dir_name" + echo " ✓ Cleaned source" + else + echo " ✗ Verification failed" + fi +done + +echo "" +echo "=== Done ===" diff --git a/morning_report.py b/morning_report.py new file mode 100644 index 0000000..a1e8103 --- /dev/null +++ b/morning_report.py @@ -0,0 +1,438 @@ +#!/usr/bin/env python3 +""" +Susan Morning Report +Generates a system health report and emails it to Tom. +Run via cron at 06:45 (after transcoder finishes). 
+""" + +import subprocess +import json +import os +import re +import smtplib +from email.mime.text import MIMEText +from email.mime.multipart import MIMEMultipart +from datetime import datetime, timedelta +from pathlib import Path + +# Config +EMAIL_CONFIG = "/etc/susan/email.json" +TRANSCODER_LOG = "/var/log/transcoder.log" +TRANSCODER_DB = "/var/lib/transcoder/cache.db" +TO_EMAIL = "tom@tomflux.xyz" + + +def run_cmd(cmd: str, timeout: int = 30) -> str: + """Run a shell command and return output.""" + try: + result = subprocess.run( + cmd, shell=True, capture_output=True, text=True, timeout=timeout + ) + return result.stdout.strip() + except Exception as e: + return f"Error: {e}" + + +def get_uptime() -> dict: + """Get system uptime info.""" + uptime_raw = run_cmd("uptime -p") + load = run_cmd("cat /proc/loadavg").split()[:3] + boot_time = run_cmd("uptime -s") + + return { + "uptime": uptime_raw.replace("up ", ""), + "since": boot_time, + "load_avg": f"{load[0]} / {load[1]} / {load[2]}" if len(load) >= 3 else "unknown" + } + + +def get_disk_space() -> list: + """Get disk usage for important mounts.""" + disks = [] + # Only show root and /disks* mounts, exclude virtual filesystems + df_output = run_cmd("df -h --output=target,size,used,avail,pcent -x tmpfs -x devtmpfs -x squashfs | tail -n +2") + + for line in df_output.split('\n'): + if line.strip(): + parts = line.split() + if len(parts) >= 5: + mount = parts[0] + # Only include / and /disks*, /home, /var + if mount in ['/', '/home', '/var'] or mount.startswith('/disks'): + disks.append({ + "mount": mount, + "size": parts[1], + "used": parts[2], + "avail": parts[3], + "percent": parts[4] + }) + return disks + + +def get_raid_status() -> dict: + """Get 3ware RAID status using tw_cli.""" + status = {"available": False, "details": None, "drives": [], "array": None} + + # Try tw_cli first + tw_output = run_cmd("sudo tw_cli /c0 show 2>/dev/null") + if "Error" not in tw_output and tw_output and "Unit" in tw_output: + status["available"] = True + status["details"] = tw_output + + # Parse array status + for line in tw_output.split('\n'): + if line.startswith('u0'): + parts = line.split() + if len(parts) >= 3: + status["array"] = { + "unit": parts[0], + "type": parts[1], + "status": parts[2], + "verify": parts[4] if len(parts) > 4 and '%' in parts[4] else None + } + # Parse drive lines (p4, p5, p6, etc) + if line.startswith('p'): + parts = line.split() + if len(parts) >= 3: + status["drives"].append({ + "port": parts[0], + "status": parts[1], + "size": parts[3] if len(parts) > 3 else "unknown" + }) + else: + # Fallback to checking for mdadm + md_output = run_cmd("cat /proc/mdstat 2>/dev/null") + if md_output and "Error" not in md_output and "md" in md_output: + status["available"] = True + status["details"] = md_output + status["type"] = "mdadm" + + return status + + +def get_smart_status() -> list: + """Get SMART status for drives. 
Note: requires sudo for smartctl.""" + drives = [] + + # Find physical block devices (skip loop, ram, etc) + lsblk = run_cmd("lsblk -d -o NAME,SIZE,TYPE | grep disk | grep -v loop") + for line in lsblk.split('\n'): + if line.strip(): + parts = line.split() + if parts and not parts[0].startswith('loop'): + dev = f"/dev/{parts[0]}" + size = parts[1] if len(parts) > 1 else "unknown" + + # Try smartctl with sudo + smart = run_cmd(f"sudo smartctl -H {dev} 2>/dev/null | grep -iE 'overall-health|result|PASSED|FAILED'") + reallocated = run_cmd(f"sudo smartctl -A {dev} 2>/dev/null | grep -i 'Reallocated_Sector'") + + # Determine health status + if "PASSED" in smart: + health = "PASSED" + elif "FAILED" in smart: + health = "FAILED" + elif "sudo:" in smart or not smart: + health = "needs sudo" + elif "Unable to detect" in run_cmd(f"sudo smartctl -i {dev} 2>&1"): + health = "RAID array (skip)" + else: + health = "unknown" + + # Skip RAID virtual devices (they show as large and can't be queried) + if health == "RAID array (skip)": + continue + + # Also skip if smartctl says it's not a physical device + if "unknown" in health and "T" in size: # Multi-TB device with unknown = likely RAID + continue + + drive_info = { + "device": dev, + "size": size, + "health": health, + } + + if reallocated: + # Extract reallocated sector count (last number on line) + match = re.search(r'(\d+)\s*$', reallocated.strip()) + if match: + drive_info["reallocated_sectors"] = int(match.group(1)) + + drives.append(drive_info) + + return drives + + +def get_memory() -> dict: + """Get memory usage.""" + mem_output = run_cmd("free -h | grep Mem") + parts = mem_output.split() + + if len(parts) >= 4: + return { + "total": parts[1], + "used": parts[2], + "available": parts[6] if len(parts) > 6 else parts[3] + } + return {"total": "unknown", "used": "unknown", "available": "unknown"} + + +def get_cpu_info() -> dict: + """Get CPU info and temperature.""" + info = {} + + # CPU model + model = run_cmd("grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2") + info["model"] = model.strip() if model else "unknown" + + # Temperatures + temps = run_cmd("sensors 2>/dev/null | grep -E 'Core|temp' | head -5") + info["temps"] = temps if temps else "sensors not available" + + return info + + +def get_transcoder_status() -> dict: + """Get last night's transcoder results + queue stats from DB.""" + import sqlite3 + + status = {"ran": False, "summary": None, "queue": None} + + # Get queue stats from database + db_path = Path(TRANSCODER_DB) + if db_path.exists(): + try: + conn = sqlite3.connect(TRANSCODER_DB) + + # Pending files + cursor = conn.execute(""" + SELECT COUNT(*), SUM(original_size) + FROM files WHERE is_hevc = 0 AND status = 'pending' + """) + row = cursor.fetchone() + pending_count = row[0] or 0 + pending_size = row[1] or 0 + + # Already HEVC + cursor = conn.execute("SELECT COUNT(*) FROM files WHERE is_hevc = 1") + hevc_count = cursor.fetchone()[0] or 0 + + # Lifetime stats + cursor = conn.execute("SELECT total_files_transcoded, total_space_saved FROM stats WHERE id = 1") + row = cursor.fetchone() + lifetime_transcoded = row[0] if row else 0 + lifetime_saved = row[1] if row else 0 + + # Failed count + cursor = conn.execute("SELECT COUNT(*) FROM files WHERE status = 'failed'") + failed_count = cursor.fetchone()[0] or 0 + + conn.close() + + status["queue"] = { + "pending_count": pending_count, + "pending_size": pending_size, + "pending_size_human": f"{pending_size / (1024**3):.1f} GB" if pending_size else "0 GB", + "hevc_count": 
hevc_count, + "failed_count": failed_count, + "lifetime_transcoded": lifetime_transcoded, + "lifetime_saved": lifetime_saved, + "lifetime_saved_human": f"{lifetime_saved / (1024**3):.1f} GB" if lifetime_saved else "0 GB" + } + except Exception as e: + status["queue"] = {"error": str(e)} + + # Check log for last night's run + log_path = Path(TRANSCODER_LOG) + if log_path.exists(): + try: + log_content = log_path.read_text() + except PermissionError: + log_content = run_cmd(f"sudo cat {TRANSCODER_LOG} 2>/dev/null") + if "Error" in log_content: + log_content = "" + + if log_content: + # Find the last SESSION COMPLETE block + sessions = re.findall( + r'SESSION COMPLETE.*?Transcoded:\s+(\d+).*?Failed:\s+(\d+).*?Space saved:\s+([\d.]+\s+\w+)', + log_content, re.DOTALL + ) + + if sessions: + last = sessions[-1] + status["ran"] = True + status["transcoded"] = int(last[0]) + status["failed"] = int(last[1]) + status["space_saved"] = last[2] + status["summary"] = f"{last[0]} transcoded, {last[1]} failed, {last[2]} saved" + else: + today = datetime.now().strftime("%Y-%m-%d") + yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") + if today in log_content or yesterday in log_content: + status["ran"] = True + status["summary"] = "Ran (check log for details)" + + return status + + +def get_failed_services() -> list: + """Get any failed systemd services.""" + output = run_cmd("systemctl --failed --no-pager --plain | grep -E '\.service|\.socket|\.mount' | awk '{print $1}'") + failed = [] + for line in output.split('\n'): + name = line.strip() + if name and not name.startswith('UNIT') and '●' not in name: + failed.append(name) + return failed + + +def generate_report() -> str: + """Generate the full morning report.""" + now = datetime.now() + + report = [] + report.append("=" * 50) + report.append(f"🖥️ SUSAN MORNING REPORT") + report.append(f"📅 {now.strftime('%A, %B %d %Y at %H:%M')}") + report.append("=" * 50) + report.append("") + + # Uptime + uptime = get_uptime() + report.append("⏱️ UPTIME") + report.append(f" Up: {uptime['uptime']}") + report.append(f" Since: {uptime['since']}") + report.append(f" Load: {uptime['load_avg']}") + report.append("") + + # Memory + mem = get_memory() + report.append("🧠 MEMORY") + report.append(f" Used: {mem['used']} / {mem['total']}") + report.append(f" Available: {mem['available']}") + report.append("") + + # Disk Space + disks = get_disk_space() + report.append("💾 DISK SPACE") + for disk in disks: + warn = " ⚠️" if disk['percent'].replace('%', '').isdigit() and int(disk['percent'].replace('%', '')) > 85 else "" + report.append(f" {disk['mount']}: {disk['used']}/{disk['size']} ({disk['percent']} used){warn}") + report.append("") + + # RAID Status + raid = get_raid_status() + report.append("🔒 RAID STATUS") + if raid["available"]: + if raid.get("array"): + arr = raid["array"] + status_icon = "✅" if arr["status"] == "OK" else ("⚠️" if "VERIFY" in arr["status"] else "❌") + report.append(f" Array: {arr['type']} {status_icon} {arr['status']}") + if arr.get("verify"): + report.append(f" Verify progress: {arr['verify']}") + + if raid.get("drives"): + report.append(f" Drives: {len(raid['drives'])} disks") + for drive in raid["drives"]: + d_icon = "✅" if drive["status"] == "OK" else "❌" + report.append(f" {drive['port']}: {d_icon} {drive['status']} ({drive['size']} TB)") + else: + report.append(" tw_cli not available - install 3ware tools for RAID monitoring") + report.append("") + + # Drive Health (SMART) + drives = get_smart_status() + if drives: + 
report.append("🔧 DRIVE HEALTH (SMART)") + for drive in drives: + warn = "" + if drive.get("reallocated_sectors", 0) > 0: + warn = f" ⚠️ {drive['reallocated_sectors']} reallocated sectors!" + health_icon = "✅" if drive["health"] == "PASSED" else "❌" + report.append(f" {drive['device']} ({drive['size']}): {health_icon} {drive['health']}{warn}") + report.append("") + + # Transcoder + transcoder = get_transcoder_status() + report.append("🎬 TRANSCODER") + + # Last night's run + if transcoder.get("ran"): + report.append(f" Last run: ✅ {transcoder.get('summary', 'Completed')}") + elif transcoder.get("summary"): + report.append(f" Last run: ⏸️ {transcoder['summary']}") + else: + report.append(f" Last run: No data") + + # Queue stats from database + if transcoder.get("queue") and not transcoder["queue"].get("error"): + q = transcoder["queue"] + report.append(f" Queue: {q['pending_count']} files ({q['pending_size_human']}) waiting") + if q['failed_count'] > 0: + report.append(f" ⚠️ Failed: {q['failed_count']} files need attention") + report.append(f" Library: {q['hevc_count']} files already HEVC") + if q['lifetime_transcoded'] > 0: + report.append(f" Lifetime: {q['lifetime_transcoded']} transcoded, {q['lifetime_saved_human']} saved") + elif transcoder.get("queue", {}).get("error"): + report.append(f" DB error: {transcoder['queue']['error']}") + + report.append("") + + # Failed Services + failed = get_failed_services() + report.append("🚨 SYSTEMD SERVICES") + if failed: + report.append(f" ❌ {len(failed)} failed: {', '.join(failed)}") + else: + report.append(" ✅ All services OK") + report.append("") + + report.append("=" * 50) + report.append("End of report") + report.append("=" * 50) + + return "\n".join(report) + + +def send_email(subject: str, body: str): + """Send the report via email.""" + with open(EMAIL_CONFIG) as f: + cfg = json.load(f) + + smtp = cfg['smtp'] + + msg = MIMEMultipart() + msg['Subject'] = subject + msg['From'] = smtp['from'] + msg['To'] = TO_EMAIL + + # Plain text version + msg.attach(MIMEText(body, 'plain')) + + server = smtplib.SMTP_SSL(smtp['server'], smtp['port']) + server.login(smtp['username'], smtp['password']) + server.sendmail(smtp['from'], [TO_EMAIL], msg.as_string()) + server.quit() + + +def main(): + report = generate_report() + + # Print to stdout (for logging) + print(report) + + # Email it + today = datetime.now().strftime("%Y-%m-%d") + subject = f"🖥️ Susan Morning Report - {today}" + + try: + send_email(subject, report) + print("\n✅ Report emailed successfully") + except Exception as e: + print(f"\n❌ Failed to send email: {e}") + + +if __name__ == "__main__": + main() diff --git a/music_config.example.json b/music_config.example.json new file mode 100644 index 0000000..ac8c622 --- /dev/null +++ b/music_config.example.json @@ -0,0 +1,15 @@ +{ + "lastfm": { + "api_key": "YOUR_LASTFM_API_KEY", + "username": "tomflux" + }, + "navidrome": { + "url": "https://navi.jihakuz.xyz", + "username": "YOUR_NAVIDROME_USERNAME", + "password": "YOUR_NAVIDROME_PASSWORD" + }, + "slskd": { + "url": "https://music.jihakuz.xyz", + "api_key": "YOUR_SLSKD_API_KEY" + } +} diff --git a/music_recommender.py b/music_recommender.py new file mode 100755 index 0000000..a366f9a --- /dev/null +++ b/music_recommender.py @@ -0,0 +1,1038 @@ +#!/usr/bin/env python3 +""" +Music Recommender - Finds music you might like based on Last.fm history, +cross-referenced with your Navidrome collection. Can auto-download via slskd. 
+ +Usage: + python music_recommender.py recommend # Get recommendations + python music_recommender.py gaps # Find library gaps + python music_recommender.py recent # Recent scrobbles + python music_recommender.py search "query" # Search Navidrome + python music_recommender.py surprise # Pick & download an album overnight + python music_recommender.py slskd "query" # Search Soulseek + python music_recommender.py download "artist - album" # Download from Soulseek +""" + +import argparse +import hashlib +import json +import os +import random +import re +import string +import sys +import time +from datetime import datetime, timedelta +from pathlib import Path +from typing import Optional +from urllib.parse import urlencode + +import requests + +# Default config path +CONFIG_PATH = Path(__file__).parent / "music_config.json" +STATE_PATH = Path(__file__).parent / "music_state.json" + + +class LastFM: + """Last.fm API client.""" + + BASE_URL = "https://ws.audioscrobbler.com/2.0/" + + def __init__(self, api_key: str, username: str): + self.api_key = api_key + self.username = username + + def _request(self, method: str, **params) -> dict: + params.update({ + "method": method, + "api_key": self.api_key, + "format": "json", + }) + resp = requests.get(self.BASE_URL, params=params) + resp.raise_for_status() + return resp.json() + + def get_recent_tracks(self, limit: int = 50) -> list[dict]: + """Get recently played tracks.""" + data = self._request("user.getrecenttracks", user=self.username, limit=limit) + tracks = data.get("recenttracks", {}).get("track", []) + return [ + { + "artist": t.get("artist", {}).get("#text", ""), + "album": t.get("album", {}).get("#text", ""), + "track": t.get("name", ""), + "now_playing": "@attr" in t and t["@attr"].get("nowplaying") == "true", + } + for t in tracks + ] + + def get_top_artists(self, period: str = "3month", limit: int = 50) -> list[dict]: + """Get top artists. 
Period: overall, 7day, 1month, 3month, 6month, 12month.""" + data = self._request("user.gettopartists", user=self.username, period=period, limit=limit) + artists = data.get("topartists", {}).get("artist", []) + return [ + { + "name": a.get("name", ""), + "playcount": int(a.get("playcount", 0)), + "url": a.get("url", ""), + } + for a in artists + ] + + def get_similar_artists(self, artist: str, limit: int = 10) -> list[dict]: + """Get artists similar to the given artist.""" + try: + data = self._request("artist.getsimilar", artist=artist, limit=limit) + similar = data.get("similarartists", {}).get("artist", []) + return [ + { + "name": a.get("name", ""), + "match": float(a.get("match", 0)), + "url": a.get("url", ""), + } + for a in similar + ] + except Exception: + return [] + + def get_artist_top_albums(self, artist: str, limit: int = 5) -> list[dict]: + """Get top albums for an artist.""" + try: + data = self._request("artist.gettopalbums", artist=artist, limit=limit) + albums = data.get("topalbums", {}).get("album", []) + return [ + { + "name": a.get("name", ""), + "artist": artist, + "playcount": int(a.get("playcount", 0)), + "url": a.get("url", ""), + } + for a in albums + if a.get("name") and a.get("name") != "(null)" + ] + except Exception: + return [] + + def get_recommendations(self, limit: int = 20) -> list[dict]: + """Get recommended artists based on listening history.""" + top = self.get_top_artists(period="3month", limit=10) + + recommendations = {} + for artist in top: + similar = self.get_similar_artists(artist["name"], limit=5) + for s in similar: + name = s["name"] + if name not in recommendations and name not in [a["name"] for a in top]: + recommendations[name] = { + "name": name, + "because_of": artist["name"], + "match": s["match"], + } + + sorted_recs = sorted(recommendations.values(), key=lambda x: x["match"], reverse=True) + return sorted_recs[:limit] + + +class Navidrome: + """Navidrome/Subsonic API client.""" + + def __init__(self, url: str, username: str, password: str): + self.base_url = url.rstrip("/") + self.username = username + self.password = password + + def _auth_params(self) -> dict: + salt = "".join(random.choices(string.ascii_lowercase + string.digits, k=8)) + token = hashlib.md5((self.password + salt).encode()).hexdigest() + return { + "u": self.username, + "t": token, + "s": salt, + "v": "1.16.1", + "c": "music_recommender", + "f": "json", + } + + def _request(self, endpoint: str, **params) -> dict: + params.update(self._auth_params()) + url = f"{self.base_url}/rest/{endpoint}" + resp = requests.get(url, params=params) + resp.raise_for_status() + data = resp.json() + + sr = data.get("subsonic-response", {}) + if sr.get("status") == "failed": + error = sr.get("error", {}) + raise Exception(f"Subsonic error {error.get('code')}: {error.get('message')}") + + return sr + + def ping(self) -> bool: + try: + self._request("ping") + return True + except Exception: + return False + + def get_artists(self) -> list[dict]: + data = self._request("getArtists") + artists = [] + for index in data.get("artists", {}).get("index", []): + for artist in index.get("artist", []): + artists.append({ + "id": artist.get("id"), + "name": artist.get("name"), + "album_count": artist.get("albumCount", 0), + }) + return artists + + def get_artist(self, artist_id: str) -> dict: + data = self._request("getArtist", id=artist_id) + artist = data.get("artist", {}) + return { + "id": artist.get("id"), + "name": artist.get("name"), + "album_count": artist.get("albumCount", 0), + "albums": 
[ + { + "id": a.get("id"), + "name": a.get("name"), + "year": a.get("year"), + "song_count": a.get("songCount", 0), + } + for a in artist.get("album", []) + ], + } + + def search(self, query: str) -> dict: + data = self._request("search3", query=query) + result = data.get("searchResult3", {}) + return { + "artists": [ + {"id": a.get("id"), "name": a.get("name")} + for a in result.get("artist", []) + ], + "albums": [ + {"id": a.get("id"), "name": a.get("name"), "artist": a.get("artist")} + for a in result.get("album", []) + ], + "songs": [ + {"id": s.get("id"), "title": s.get("title"), "artist": s.get("artist"), "album": s.get("album")} + for s in result.get("song", []) + ], + } + + +class Slskd: + """slskd API client for Soulseek searches and downloads.""" + + def __init__(self, url: str, api_key: str): + self.base_url = url.rstrip("/") + self.api_key = api_key + self.headers = {"X-API-Key": api_key} + + def _request(self, method: str, endpoint: str, **kwargs) -> dict: + url = f"{self.base_url}/api/v0/{endpoint}" + kwargs.setdefault("headers", {}).update(self.headers) + resp = requests.request(method, url, **kwargs) + resp.raise_for_status() + if resp.content: + return resp.json() + return {} + + def search(self, query: str, timeout: int = 30) -> str: + """Start a search and return the search ID.""" + data = self._request("POST", "searches", json={"searchText": query}) + return data.get("id") + + def get_search_state(self, search_id: str) -> dict: + """Get search state.""" + return self._request("GET", f"searches/{search_id}") + + def wait_for_search(self, search_id: str, timeout: int = 60, min_results: int = 5, min_wait: int = 30) -> bool: + """Wait for search to complete or have enough results. + + Args: + search_id: The search ID to wait for + timeout: Maximum time to wait + min_results: Return early if we have this many files + min_wait: Minimum time to wait before honoring isComplete with 0 results + """ + start = time.time() + while time.time() - start < timeout: + state = self.get_search_state(search_id) + file_count = state.get("fileCount", 0) + elapsed = time.time() - start + + # Got enough results - done + if file_count >= min_results: + return True + + # Search complete with results - done + if state.get("isComplete") and file_count > 0: + return True + + # Search complete with NO results - only trust after min_wait + # (Soulseek can mark complete before results arrive) + if state.get("isComplete") and elapsed >= min_wait: + return True + + time.sleep(2) + return False + + def get_search_responses(self, search_id: str, max_retries: int = 15, retry_delay: float = 2) -> list[dict]: + """Get search responses (results grouped by user). + + Retries if response list is empty but state shows results exist. + slskd has a race condition where /responses can lag significantly + behind state.responseCount - sometimes 10+ seconds. 
+ """ + for attempt in range(max_retries): + responses = self._request("GET", f"searches/{search_id}/responses") + if responses: + return responses + + # Check if state says there should be responses + state = self.get_search_state(search_id) + if state.get("responseCount", 0) == 0: + # Genuinely no results + return [] + + # State has responses but endpoint returned empty - wait and retry + time.sleep(retry_delay) + + return [] + + def download_files(self, username: str, files: list[dict]) -> dict: + """Queue files for download from a user.""" + return self._request("POST", f"transfers/downloads/{username}", json=files) + + def get_downloads(self) -> list[dict]: + """Get current downloads.""" + return self._request("GET", "transfers/downloads") + + def search_and_find_album(self, artist: str, album: str, timeout: int = 60) -> Optional[dict]: + """Search for an album and return best result.""" + query = f"{artist} {album}" + print(f" Searching Soulseek for: {query}") + + search_id = self.search(query) + if not search_id: + return None + + # Wait for results + self.wait_for_search(search_id, timeout=timeout, min_results=3) + + responses = self.get_search_responses(search_id) + if not responses: + return None + + # Score and rank results + best_match = None + best_score = 0 + + for response in responses: + username = response.get("username", "") + files = response.get("files", []) + + # Skip users with no free slots + if not response.get("hasFreeUploadSlot", False): + continue + + # Group files by directory + directories = {} + for f in files: + filename = f.get("filename", "") + # Extract directory + parts = filename.replace("\\", "/").rsplit("/", 1) + if len(parts) == 2: + dir_path, fname = parts + else: + dir_path, fname = "", parts[0] + + if dir_path not in directories: + directories[dir_path] = [] + directories[dir_path].append(f) + + # Find directories that look like albums (multiple audio files) + for dir_path, dir_files in directories.items(): + audio_files = [f for f in dir_files if any( + f.get("filename", "").lower().endswith(ext) + for ext in [".flac", ".mp3", ".m4a", ".ogg", ".opus"] + )] + + if len(audio_files) < 3: + continue + + # Score this directory + score = 0 + dir_lower = dir_path.lower() + + # Prefer FLAC + flac_count = sum(1 for f in audio_files if f.get("filename", "").lower().endswith(".flac")) + if flac_count == len(audio_files): + score += 100 + elif flac_count > 0: + score += 50 + + # Check artist/album in path + if artist.lower() in dir_lower: + score += 30 + if album.lower() in dir_lower: + score += 30 + + # Prefer more complete albums + score += min(len(audio_files), 20) + + # Prefer faster users (higher upload speed) + score += min(response.get("uploadSpeed", 0) // 100000, 20) + + if score > best_score: + best_score = score + best_match = { + "username": username, + "directory": dir_path, + "files": audio_files, + "file_count": len(audio_files), + "score": score, + "has_flac": flac_count > 0, + } + + return best_match + + def download_album(self, match: dict) -> bool: + """Download an album match.""" + if not match: + return False + + username = match["username"] + files = match["files"] + + # Format files for download API + download_files = [ + {"filename": f["filename"], "size": f.get("size", 0)} + for f in files + ] + + try: + self.download_files(username, download_files) + return True + except Exception as e: + print(f" Download failed: {e}") + return False + + def get_user_downloads(self, username: str) -> list[dict]: + """Get downloads from a specific 
user.""" + try: + all_downloads = self._request("GET", "transfers/downloads") + # Filter by username - strip invisible Unicode chars for robust matching + clean = lambda s: ''.join(c for c in (s or '') if c.isprintable()) + target = clean(username) + return [d for d in all_downloads if clean(d.get("username", "")) == target] + except Exception: + return [] + + def cancel_user_downloads(self, username: str) -> bool: + """Cancel all downloads from a user.""" + try: + self._request("DELETE", f"transfers/downloads/{username}") + return True + except Exception: + return False + + def wait_for_download(self, username: str, expected_files: int, + timeout_minutes: int = 30, check_interval: int = 60) -> bool: + """Wait for downloads from a user to complete.""" + import time + + start = time.time() + timeout_seconds = timeout_minutes * 60 + + while time.time() - start < timeout_seconds: + downloads = self.get_user_downloads(username) + + if not downloads: + # No downloads found - might have completed and been removed + # Or never started + return False + + # Check states + completed = sum(1 for d in downloads + for f in d.get("files", []) + if f.get("state", "").startswith("Completed")) + failed = sum(1 for d in downloads + for f in d.get("files", []) + if any(s in (f.get("state") or "") for s in ["Errored", "Cancelled", "TimedOut", "Rejected", "Aborted"])) + in_progress = sum(1 for d in downloads + for f in d.get("files", []) + if any(s in (f.get("state") or "") for s in ["Queued", "Initializing", "InProgress"])) + + total = completed + failed + in_progress + + print(f" Progress: {completed}/{total} complete, {failed} failed, {in_progress} in progress") + + if completed >= expected_files: + return True + + if failed > 0 and in_progress == 0: + # All either completed or failed, nothing in progress + return completed > 0 + + time.sleep(check_interval) + + # Timeout + return False + + +class MusicRecommender: + """Main recommender combining all sources.""" + + def __init__(self, config: dict): + self.config = config + + self.lastfm = LastFM( + api_key=config["lastfm"]["api_key"], + username=config["lastfm"]["username"], + ) + + self.navidrome = None + if "navidrome" in config: + self.navidrome = Navidrome( + url=config["navidrome"]["url"], + username=config["navidrome"]["username"], + password=config["navidrome"]["password"], + ) + + self.slskd = None + if "slskd" in config: + self.slskd = Slskd( + url=config["slskd"]["url"], + api_key=config["slskd"]["api_key"], + ) + + def get_library_artists(self) -> set[str]: + if not self.navidrome: + return set() + artists = self.navidrome.get_artists() + return {a["name"].lower() for a in artists} + + def get_library_albums(self) -> set[str]: + """Get set of 'artist - album' strings in library.""" + if not self.navidrome: + return set() + + albums = set() + artists = self.navidrome.get_artists() + for artist in artists: + try: + details = self.navidrome.get_artist(artist["id"]) + for album in details.get("albums", []): + key = f"{artist['name'].lower()} - {album['name'].lower()}" + albums.add(key) + except Exception: + pass + return albums + + def find_gaps(self) -> list[dict]: + """Find artists you listen to but don't have in your library.""" + top_artists = self.lastfm.get_top_artists(period="6month", limit=50) + library = self.get_library_artists() + + # Filter out YouTube auto-channels and other noise + noise_patterns = [" - topic", "official", "vevo", "cyber gardens"] + + gaps = [] + for artist in top_artists: + name_lower = artist["name"].lower() + + # 
Skip noise + if any(p in name_lower for p in noise_patterns): + continue + + if name_lower not in library: + gaps.append({ + "name": artist["name"], + "playcount": artist["playcount"], + "reason": "listened to but not in library", + }) + + return gaps + + def get_disliked_artists(self) -> set[str]: + """Get set of disliked artist names (lowercase) from config feedback.""" + disliked = set() + feedback = self.config.get("feedback", {}) + for entry in feedback.get("disliked", []): + if "artist" in entry: + disliked.add(entry["artist"].lower()) + return disliked + + def get_recommendations(self) -> list[dict]: + recs = self.lastfm.get_recommendations(limit=30) + library = self.get_library_artists() + disliked = self.get_disliked_artists() + filtered = [r for r in recs if r["name"].lower() not in library and r["name"].lower() not in disliked] + return filtered[:20] + + def pick_surprise_album(self) -> Optional[dict]: + """Pick an album to download as a surprise.""" + # Load state to avoid repeats + state = self._load_state() + downloaded = set(state.get("downloaded_albums", [])) + + # Strategy 1: Album from a gap artist (artist you listen to but don't own) + gaps = self.find_gaps()[:10] + + # Strategy 2: Album from a recommended artist + recs = self.get_recommendations()[:10] + + # Combine and shuffle + candidates = [] + + for gap in gaps: + albums = self.lastfm.get_artist_top_albums(gap["name"], limit=3) + for album in albums: + key = f"{gap['name'].lower()} - {album['name'].lower()}" + if key not in downloaded: + candidates.append({ + "artist": gap["name"], + "album": album["name"], + "reason": f"You've played {gap['name']} {gap['playcount']} times but don't own this", + "priority": 2, # Higher priority for gaps + }) + + for rec in recs: + albums = self.lastfm.get_artist_top_albums(rec["name"], limit=2) + for album in albums: + key = f"{rec['name'].lower()} - {album['name'].lower()}" + if key not in downloaded: + candidates.append({ + "artist": rec["name"], + "album": album["name"], + "reason": f"Similar to {rec['because_of']}", + "priority": 1, + }) + + if not candidates: + return None + + # Sort by priority, then shuffle within priority + candidates.sort(key=lambda x: -x["priority"]) + top_priority = candidates[0]["priority"] + top_candidates = [c for c in candidates if c["priority"] == top_priority] + + return random.choice(top_candidates) + + def surprise_download(self, max_attempts: int = 3, wait_for_completion: bool = False, + timeout_minutes: int = 30) -> dict: + """Pick and download a surprise album. Tries multiple albums if needed. 
+ + Args: + max_attempts: Maximum albums to try + wait_for_completion: If True, wait for download to complete (for overnight job) + timeout_minutes: How long to wait for each download before giving up + """ + if not self.slskd: + return {"success": False, "error": "slskd not configured"} + + # Get all candidates + state = self._load_state() + downloaded = set(state.get("downloaded_albums", [])) + disliked = self.get_disliked_artists() + + candidates = [] + + # From gaps (higher priority) - skip disliked artists + for gap in self.find_gaps()[:15]: + if gap["name"].lower() in disliked: + continue + albums = self.lastfm.get_artist_top_albums(gap["name"], limit=3) + for album in albums: + key = f"{gap['name'].lower()} - {album['name'].lower()}" + if key not in downloaded: + candidates.append({ + "artist": gap["name"], + "album": album["name"], + "reason": f"You've played {gap['name']} {gap['playcount']} times but don't own this", + "priority": 2, + }) + + # From recommendations + for rec in self.get_recommendations()[:10]: + albums = self.lastfm.get_artist_top_albums(rec["name"], limit=2) + for album in albums: + key = f"{rec['name'].lower()} - {album['name'].lower()}" + if key not in downloaded: + candidates.append({ + "artist": rec["name"], + "album": album["name"], + "reason": f"Similar to {rec['because_of']}", + "priority": 1, + }) + + # From Discogs labels (curated 90s labels) + discogs_path = Path(__file__).parent / "discogs_labels.json" + if discogs_path.exists(): + try: + with open(discogs_path) as f: + discogs_data = json.load(f) + # Pick random releases from the pool + discogs_releases = discogs_data.get("releases", []) + random.shuffle(discogs_releases) + for release in discogs_releases[:20]: + if release['artist'].lower() in disliked: + continue + key = f"{release['artist'].lower()} - {release['album'].lower()}" + if key not in downloaded: + year_str = f" ({release['year']})" if release.get('year') else "" + candidates.append({ + "artist": release["artist"], + "album": release["album"], + "reason": f"From {release['label']}{year_str}", + "priority": 1, # Same priority as recommendations + }) + except Exception: + pass # Silently skip if file is corrupt + + if not candidates: + return {"success": False, "error": "No suitable albums found"} + + # Shuffle within priority groups + random.shuffle(candidates) + candidates.sort(key=lambda x: -x["priority"]) + + # Try up to max_attempts albums + tried = [] + for pick in candidates[:max_attempts]: + print(f"\n🎲 Trying: {pick['artist']} - {pick['album']}") + print(f" Reason: {pick['reason']}\n") + + match = self.slskd.search_and_find_album(pick["artist"], pick["album"]) + + if not match: + print(f" ❌ Not found on Soulseek, trying next...") + tried.append(f"{pick['artist']} - {pick['album']} (not found)") + continue + + print(f" Found: {match['file_count']} files from {match['username']}") + print(f" FLAC: {'Yes' if match['has_flac'] else 'No'}") + + if not self.slskd.download_album(match): + tried.append(f"{pick['artist']} - {pick['album']} (queue failed)") + continue + + print(f" ✓ Download queued!") + + # If not waiting for completion, return success after queuing + if not wait_for_completion: + self._save_downloaded(pick["artist"], pick["album"], pick.get("reason")) + return { + "success": True, + "artist": pick["artist"], + "album": pick["album"], + "reason": pick["reason"], + "files": match["file_count"], + "source": match["username"], + "has_flac": match["has_flac"], + "attempts": len(tried) + 1, + "status": "queued", + } + + # Wait 
for download to complete + print(f" ⏳ Waiting up to {timeout_minutes} minutes for download...") + + if self.slskd.wait_for_download(match["username"], match["file_count"], + timeout_minutes=timeout_minutes): + self._save_downloaded(pick["artist"], pick["album"], pick.get("reason")) + return { + "success": True, + "artist": pick["artist"], + "album": pick["album"], + "reason": pick["reason"], + "files": match["file_count"], + "source": match["username"], + "has_flac": match["has_flac"], + "attempts": len(tried) + 1, + "status": "completed", + } + else: + # Download timed out - cancel and try next + print(f" ⏱️ Timeout! Cancelling and trying next album...") + self.slskd.cancel_user_downloads(match["username"]) + tried.append(f"{pick['artist']} - {pick['album']} (timeout after {timeout_minutes}min)") + continue + + # Save failed attempts to wishlist for Discogs bookmarking + self._save_wishlist(tried) + + return { + "success": False, + "error": f"Tried {len(tried)} albums, none worked", + "tried": tried, + } + + def _save_wishlist(self, albums: list[str]): + """Save albums that couldn't be found to wishlist + bookmarks.""" + wishlist_path = Path(__file__).parent / "music_wishlist.md" + + # Read existing wishlist + existing = set() + if wishlist_path.exists(): + with open(wishlist_path) as f: + for line in f: + if line.startswith("- [ ] "): + existing.add(line.strip()[6:].split(" — ")[0]) + + # Add new entries to markdown wishlist + with open(wishlist_path, "a") as f: + if not wishlist_path.exists() or wishlist_path.stat().st_size == 0: + f.write("# Music Wishlist\n\n") + f.write("Albums that couldn't be found on Soulseek.\n") + f.write("Search Discogs/Bandcamp manually or check back later.\n\n") + + for album in albums: + clean = album.replace(" (download failed)", "").replace(" (not found)", "").replace(" (queue failed)", "") + clean = clean.split(" (timeout")[0] # Remove timeout messages + if clean not in existing: + query = clean.replace(" - ", " ").replace(" ", "+") + discogs_url = f"https://www.discogs.com/search/?q={query}&type=release" + f.write(f"- [ ] {clean} — [Discogs]({discogs_url})\n") + + # Also add to Floccus bookmarks + self._add_to_bookmarks(clean, discogs_url) + + def _add_to_bookmarks(self, title: str, url: str): + """Add album to Floccus bookmarks in Music/Wishlist folder.""" + import subprocess + + add_bookmark_script = Path(__file__).parent / "add_bookmark_to_wishlist.js" + if add_bookmark_script.exists(): + try: + subprocess.run( + ["node", str(add_bookmark_script), url, title], + capture_output=True, + timeout=30 + ) + print(f" 📌 Added to bookmarks: {title}") + except Exception as e: + print(f" ⚠ Bookmark add failed: {e}") + + def _load_state(self) -> dict: + if STATE_PATH.exists(): + with open(STATE_PATH) as f: + return json.load(f) + return {"downloaded_albums": []} + + def _save_downloaded(self, artist: str, album: str, reason: str = None): + state = self._load_state() + key = f"{artist.lower()} - {album.lower()}" + if key not in state["downloaded_albums"]: + state["downloaded_albums"].append(key) + # Keep last 100 + state["downloaded_albums"] = state["downloaded_albums"][-100:] + state["last_download"] = { + "artist": artist, + "album": album, + "reason": reason, + "timestamp": datetime.now().isoformat(), + } + with open(STATE_PATH, "w") as f: + json.dump(state, f, indent=2) + + def format_recommendations(self, recs: list[dict]) -> str: + lines = ["# Music Recommendations\n"] + lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n") + + for i, rec 
in enumerate(recs, 1): + lines.append(f"{i}. **{rec['name']}**") + if "because_of" in rec: + lines.append(f" Similar to: {rec['because_of']}") + lines.append("") + + return "\n".join(lines) + + def format_gaps(self, gaps: list[dict]) -> str: + lines = ["# Library Gaps\n"] + lines.append("Artists you listen to but don't have in Navidrome:\n") + + for i, gap in enumerate(gaps, 1): + lines.append(f"{i}. **{gap['name']}** ({gap['playcount']} plays)") + + return "\n".join(lines) + + +def load_config(path: Path) -> dict: + if not path.exists(): + print(f"Config not found: {path}") + print("Create it with your API keys. See music_config.example.json") + sys.exit(1) + + with open(path) as f: + return json.load(f) + + +def main(): + parser = argparse.ArgumentParser(description="Music recommendation tool") + parser.add_argument("command", nargs="?", default="recommend", + choices=["recommend", "gaps", "recent", "search", "test", + "surprise", "slskd", "download"], + help="Command to run") + parser.add_argument("query", nargs="?", help="Search query") + parser.add_argument("--config", type=Path, default=CONFIG_PATH, + help="Path to config file") + parser.add_argument("--json", action="store_true", help="Output as JSON") + parser.add_argument("--wait", action="store_true", + help="Wait for download to complete (for overnight jobs)") + parser.add_argument("--timeout", type=int, default=30, + help="Timeout in minutes for download completion (default: 30)") + + args = parser.parse_args() + config = load_config(args.config) + recommender = MusicRecommender(config) + + if args.command == "test": + print("Testing Last.fm...", end=" ") + try: + recent = recommender.lastfm.get_recent_tracks(limit=1) + print(f"OK (last track: {recent[0]['track'][:30] if recent else 'none'}...)") + except Exception as e: + print(f"FAILED: {e}") + + if recommender.navidrome: + print("Testing Navidrome...", end=" ") + if recommender.navidrome.ping(): + artists = recommender.navidrome.get_artists() + print(f"OK ({len(artists)} artists)") + else: + print("FAILED") + + if recommender.slskd: + print("Testing slskd...", end=" ") + try: + recommender.slskd._request("GET", "server") + print("OK") + except Exception as e: + print(f"FAILED: {e}") + + elif args.command == "recommend": + recs = recommender.get_recommendations() + if args.json: + print(json.dumps(recs, indent=2)) + else: + print(recommender.format_recommendations(recs)) + + elif args.command == "gaps": + gaps = recommender.find_gaps() + if args.json: + print(json.dumps(gaps, indent=2)) + else: + print(recommender.format_gaps(gaps)) + + elif args.command == "recent": + recent = recommender.lastfm.get_recent_tracks(limit=20) + if args.json: + print(json.dumps(recent, indent=2)) + else: + print("# Recent Tracks\n") + for t in recent: + status = "🎵 NOW" if t["now_playing"] else " " + print(f"{status} {t['artist']} - {t['track']}") + + elif args.command == "search": + if not args.query: + print("Search requires a query") + sys.exit(1) + + if recommender.navidrome: + print(f"Searching Navidrome for '{args.query}'...") + results = recommender.navidrome.search(args.query) + if results["artists"]: + print("\nArtists in library:") + for a in results["artists"][:5]: + print(f" - {a['name']}") + if results["albums"]: + print("\nAlbums in library:") + for a in results["albums"][:5]: + print(f" - {a['artist']} - {a['name']}") + + elif args.command == "surprise": + result = recommender.surprise_download( + wait_for_completion=args.wait, + timeout_minutes=args.timeout, + ) + if args.json: + 
print(json.dumps(result, indent=2)) + else: + if result["success"]: + status = result.get("status", "queued") + print(f"\n✅ {'Download complete!' if status == 'completed' else 'Queued for download!'}") + print(f" {result['artist']} - {result['album']}") + print(f" {result['files']} files {'(FLAC)' if result['has_flac'] else '(MP3)'}") + print(f" From: {result['source']}") + if result.get("attempts", 1) > 1: + print(f" (took {result['attempts']} attempts)") + else: + print(f"\n❌ {result['error']}") + if "tried" in result: + print(" Tried:") + for t in result["tried"]: + print(f" - {t}") + + elif args.command == "slskd": + if not args.query: + print("slskd search requires a query") + sys.exit(1) + + if not recommender.slskd: + print("slskd not configured") + sys.exit(1) + + print(f"Searching Soulseek for '{args.query}'...") + search_id = recommender.slskd.search(args.query) + recommender.slskd.wait_for_search(search_id, timeout=30) + # Brief pause to ensure /responses endpoint is populated + time.sleep(2) + responses = recommender.slskd.get_search_responses(search_id) + + print(f"\nFound {len(responses)} users with results:\n") + for resp in responses[:10]: + username = resp.get("username", "?") + files = resp.get("files", []) + has_slot = "✓" if resp.get("hasFreeUploadSlot") else "✗" + print(f" {has_slot} {username}: {len(files)} files") + + elif args.command == "download": + if not args.query: + print("download requires 'artist - album'") + sys.exit(1) + + if not recommender.slskd: + print("slskd not configured") + sys.exit(1) + + # Parse "artist - album" + if " - " in args.query: + artist, album = args.query.split(" - ", 1) + else: + print("Format: 'Artist - Album'") + sys.exit(1) + + print(f"Searching for: {artist} - {album}") + match = recommender.slskd.search_and_find_album(artist, album) + + if match: + print(f"\nBest match: {match['file_count']} files from {match['username']}") + print(f"FLAC: {'Yes' if match['has_flac'] else 'No'}") + print(f"Directory: {match['directory'][:60]}...") + + if recommender.slskd.download_album(match): + print("\n✅ Download queued!") + else: + print("\n❌ Download failed") + else: + print("\n❌ No suitable results found") + + +if __name__ == "__main__": + main() diff --git a/overnight_transcoder.py b/overnight_transcoder.py new file mode 100755 index 0000000..8a6d7d0 --- /dev/null +++ b/overnight_transcoder.py @@ -0,0 +1,1245 @@ +#!/usr/bin/env python3 +""" +Overnight Video Transcoder +Transcodes large video files to HEVC to save space. +Runs via cron between 02:00-07:00. 
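+
+Example crontab entry (illustrative; the interpreter and install path are
+assumptions, adjust for your setup):
+    0 2 * * * /usr/bin/python3 /path/to/overnight_transcoder.py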
+ +Usage: + overnight_transcoder.py # Normal run + overnight_transcoder.py --dry # Dry run - show what would be transcoded + overnight_transcoder.py --stats # Show statistics + overnight_transcoder.py --verbose # Extra debug output +""" + +import os +import sys +import json +import subprocess +import shutil +import logging +import argparse +import sqlite3 +import traceback +import re +import signal +from pathlib import Path +from datetime import datetime, time +from typing import Optional, List, Dict, Tuple +from contextlib import contextmanager + +# Parse arguments +parser = argparse.ArgumentParser(description="Overnight video transcoder") +parser.add_argument("--dry", action="store_true", help="Dry run - don't transcode, just show what would happen") +parser.add_argument("--stats", action="store_true", help="Show cache statistics and exit") +parser.add_argument("--failed", action="store_true", help="Show failed transcodes and exit") +parser.add_argument("--retry-failed", action="store_true", help="Reset failed files to pending for retry") +parser.add_argument("--clear-cache", action="store_true", help="Clear the cache and exit") +parser.add_argument("--verbose", "-v", action="store_true", help="Verbose debug output") +parser.add_argument("--json-log", action="store_true", help="Output structured JSON logs (one per line)") +args = parser.parse_args() + +DRY_RUN = args.dry +VERBOSE = args.verbose +JSON_LOG = args.json_log + +# Track current transcode for signal handling +CURRENT_TRANSCODE = {"path": None, "pid": None, "start_time": None} + +# Configuration +CONFIG = { + "source_dirs": [ + "/disks/Plex/Anime", + "/disks/Plex/Films", + "/disks/Plex/TV", + ], + "cleanup_dir": "/disks/Plex/.cleanup", + "log_dir": "/var/log", + "log_file": "/var/log/transcoder.log", + "db_file": "/var/lib/transcoder/cache.db", + "video_extensions": [".mkv", ".mp4", ".avi", ".m4v", ".mov", ".wmv"], + "cutoff_time": time(6, 30), # Don't start new encodes after 06:30 + "cleanup_days": 7, # Delete originals after 7 days + + # FFmpeg settings + "crf": 20, + "preset": "slow", + "audio_codec": "ac3", + "audio_bitrate": "640k", + "threads": 12, +} + + +def init_db() -> sqlite3.Connection: + """Initialize the SQLite database for caching file info.""" + db_path = Path(CONFIG["db_file"]) + db_path.parent.mkdir(parents=True, exist_ok=True) + + conn = sqlite3.connect(CONFIG["db_file"]) + + # Main files table - tracks all video files + conn.execute(""" + CREATE TABLE IF NOT EXISTS files ( + path TEXT PRIMARY KEY, + filename TEXT, + directory TEXT, + original_size INTEGER, + mtime REAL, + video_codec TEXT, + audio_codec TEXT, + is_hevc INTEGER, + first_seen REAL, + scanned_at REAL, + + -- Transcode info (NULL if not transcoded) + status TEXT DEFAULT 'pending', -- pending, transcoding, done, failed, stalled + transcoded_at REAL, + transcoded_size INTEGER, + transcode_duration_secs REAL, + transcode_settings TEXT, -- JSON: {crf, preset, audio_codec, audio_bitrate} + failure_reason TEXT, + + -- Computed (updated after transcode) + space_saved INTEGER, + compression_ratio REAL + ) + """) + conn.execute("CREATE INDEX IF NOT EXISTS idx_is_hevc ON files(is_hevc)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_status ON files(status)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_size ON files(original_size)") + conn.execute("CREATE INDEX IF NOT EXISTS idx_directory ON files(directory)") + + # Stats summary table - running totals + conn.execute(""" + CREATE TABLE IF NOT EXISTS stats ( + id INTEGER PRIMARY KEY CHECK (id = 1), -- 
Single row + total_files_transcoded INTEGER DEFAULT 0, + total_space_saved INTEGER DEFAULT 0, + total_transcode_time_secs REAL DEFAULT 0, + last_updated REAL + ) + """) + conn.execute("INSERT OR IGNORE INTO stats (id) VALUES (1)") + + conn.commit() + return conn + + +def get_audio_codec(filepath: str) -> Optional[str]: + """Get the audio codec of a file using ffprobe.""" + try: + result = subprocess.run([ + "ffprobe", "-v", "quiet", + "-select_streams", "a:0", + "-show_entries", "stream=codec_name", + "-of", "json", + filepath + ], capture_output=True, text=True, timeout=30) + + if result.returncode == 0: + data = json.loads(result.stdout) + streams = data.get("streams", []) + if streams: + return streams[0].get("codec_name", "").lower() + except Exception as e: + log.warning(f"Failed to probe audio {filepath}: {e}") + return None + + +def get_cached_file(conn: sqlite3.Connection, filepath: str, size: int, mtime: float) -> Optional[dict]: + """Get cached file info if file hasn't changed. Returns dict or None.""" + cursor = conn.execute( + """SELECT video_codec, audio_codec, is_hevc, status + FROM files WHERE path = ? AND original_size = ? AND mtime = ?""", + (filepath, size, mtime) + ) + row = cursor.fetchone() + if row: + return { + "video_codec": row[0], + "audio_codec": row[1], + "is_hevc": bool(row[2]), + "status": row[3] + } + return None + + +def cache_file(conn: sqlite3.Connection, filepath: str, size: int, mtime: float, + video_codec: str, audio_codec: str, is_hevc: bool): + """Cache file info.""" + p = Path(filepath) + now = datetime.now().timestamp() + + # Check if file already exists (to preserve first_seen) + cursor = conn.execute("SELECT first_seen FROM files WHERE path = ?", (filepath,)) + row = cursor.fetchone() + first_seen = row[0] if row else now + + conn.execute(""" + INSERT OR REPLACE INTO files + (path, filename, directory, original_size, mtime, video_codec, audio_codec, + is_hevc, first_seen, scanned_at, status) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, + COALESCE((SELECT status FROM files WHERE path = ?), 'pending')) + """, (filepath, p.name, str(p.parent), size, mtime, video_codec, audio_codec, + int(is_hevc), first_seen, now, filepath)) + conn.commit() + + +def update_transcode_result(conn: sqlite3.Connection, filepath: str, new_path: str, + new_size: int, duration_secs: float, settings: dict, + success: bool, failure_reason: str = None): + """Update database after transcode attempt.""" + now = datetime.now().timestamp() + + if success: + cursor = conn.execute("SELECT original_size FROM files WHERE path = ?", (filepath,)) + row = cursor.fetchone() + original_size = row[0] if row else 0 + + space_saved = original_size - new_size + compression_ratio = new_size / original_size if original_size > 0 else 0 + + conn.execute(""" + UPDATE files SET + status = 'done', + transcoded_at = ?, + transcoded_size = ?, + transcode_duration_secs = ?, + transcode_settings = ?, + space_saved = ?, + compression_ratio = ?, + failure_reason = NULL + WHERE path = ? + """, (now, new_size, duration_secs, json.dumps(settings), + space_saved, compression_ratio, filepath)) + + # Update running totals + conn.execute(""" + UPDATE stats SET + total_files_transcoded = total_files_transcoded + 1, + total_space_saved = total_space_saved + ?, + total_transcode_time_secs = total_transcode_time_secs + ?, + last_updated = ? + WHERE id = 1 + """, (space_saved, duration_secs, now)) + else: + conn.execute(""" + UPDATE files SET + status = 'failed', + failure_reason = ? + WHERE path = ? 
+ """, (failure_reason, filepath)) + + conn.commit() + + +def remove_from_cache(conn: sqlite3.Connection, filepath: str): + """Remove a file from the cache.""" + conn.execute("DELETE FROM files WHERE path = ?", (filepath,)) + conn.commit() + + +def get_cache_stats(conn: sqlite3.Connection) -> dict: + """Get comprehensive stats from the cache.""" + # File counts and sizes + cursor = conn.execute(""" + SELECT + COUNT(*) as total, + SUM(CASE WHEN is_hevc = 1 THEN 1 ELSE 0 END) as hevc_count, + SUM(CASE WHEN is_hevc = 0 AND status = 'pending' THEN 1 ELSE 0 END) as pending_count, + SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as done_count, + SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed_count, + SUM(CASE WHEN is_hevc = 1 THEN original_size ELSE 0 END) as hevc_size, + SUM(CASE WHEN is_hevc = 0 AND status = 'pending' THEN original_size ELSE 0 END) as pending_size + FROM files + """) + row = cursor.fetchone() + file_stats = { + "total": row[0] or 0, + "hevc_count": row[1] or 0, + "pending_count": row[2] or 0, + "done_count": row[3] or 0, + "failed_count": row[4] or 0, + "hevc_size": row[5] or 0, + "pending_size": row[6] or 0, + } + + # Running totals + cursor = conn.execute("SELECT * FROM stats WHERE id = 1") + row = cursor.fetchone() + if row: + file_stats["total_transcoded"] = row[1] or 0 + file_stats["total_space_saved"] = row[2] or 0 + file_stats["total_transcode_time"] = row[3] or 0 + + # By directory breakdown + cursor = conn.execute(""" + SELECT directory, + COUNT(*) as count, + SUM(CASE WHEN is_hevc = 0 AND status = 'pending' THEN original_size ELSE 0 END) as pending_size + FROM files + GROUP BY directory + ORDER BY pending_size DESC + LIMIT 10 + """) + file_stats["by_directory"] = cursor.fetchall() + + return file_stats + +# Ensure log directory exists (skip for /var/log which requires root) +if not CONFIG["log_dir"].startswith("/var/"): + Path(CONFIG["log_dir"]).mkdir(parents=True, exist_ok=True) + +# Set up logging +log_level = logging.DEBUG if VERBOSE else logging.INFO +logging.basicConfig( + level=log_level, + format='%(asctime)s [%(levelname)s] %(message)s', + handlers=[ + logging.FileHandler(CONFIG["log_file"]), + logging.StreamHandler(sys.stdout) + ] +) +log = logging.getLogger(__name__) + + +def log_json(event: str, **data): + """Write structured JSON log entry (for machine parsing).""" + if not JSON_LOG: + return + entry = { + "ts": datetime.now().isoformat(), + "event": event, + **data + } + print(json.dumps(entry), file=sys.stderr) + + +def format_bytes(size: int) -> str: + """Human-readable byte size.""" + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if abs(size) < 1024.0: + return f"{size:.2f} {unit}" + size /= 1024.0 + return f"{size:.2f} PB" + + +def format_duration(seconds: float) -> str: + """Human-readable duration.""" + if seconds < 60: + return f"{seconds:.1f}s" + elif seconds < 3600: + return f"{seconds/60:.1f}m" + else: + return f"{seconds/3600:.1f}h" + + +def get_disk_space(path: str) -> dict: + """Get disk space info for a path.""" + try: + stat = os.statvfs(path) + total = stat.f_blocks * stat.f_frsize + free = stat.f_bavail * stat.f_frsize + used = total - free + return { + "total": total, + "free": free, + "used": used, + "percent_used": (used / total * 100) if total > 0 else 0 + } + except Exception as e: + log.warning(f"Failed to get disk space for {path}: {e}") + return None + + +def check_disk_space(): + """Log disk space for all source directories.""" + log.info("=" * 50) + log.info("Disk Space Check") + log.info("=" * 50) + + 
checked_mounts = set() + for source_dir in CONFIG["source_dirs"]: + # Get mount point to avoid duplicate checks + try: + mount = subprocess.run( + ["df", "--output=target", source_dir], + capture_output=True, text=True + ).stdout.strip().split('\n')[-1] + except: + mount = source_dir + + if mount in checked_mounts: + continue + checked_mounts.add(mount) + + space = get_disk_space(source_dir) + if space: + log.info(f" {mount}:") + log.info(f" Total: {format_bytes(space['total'])}") + log.info(f" Free: {format_bytes(space['free'])} ({100-space['percent_used']:.1f}% available)") + log_json("disk_space", mount=mount, **space) + + if space['percent_used'] > 90: + log.warning(f" ⚠️ LOW DISK SPACE on {mount}!") + log.info("") + + +def get_video_duration(filepath: str) -> Optional[float]: + """Get video duration in seconds using ffprobe.""" + try: + result = subprocess.run([ + "ffprobe", "-v", "quiet", + "-show_entries", "format=duration", + "-of", "json", + filepath + ], capture_output=True, text=True, timeout=30) + + if result.returncode == 0: + data = json.loads(result.stdout) + return float(data.get("format", {}).get("duration", 0)) + except Exception as e: + log.debug(f"Failed to get duration for {filepath}: {e}") + return None + + +def handle_interrupt(signum, frame): + """Handle SIGINT/SIGTERM gracefully.""" + log.warning("=" * 50) + log.warning("INTERRUPT RECEIVED - cleaning up...") + log.warning("=" * 50) + + if CURRENT_TRANSCODE["path"]: + log.warning(f"Interrupted transcode: {CURRENT_TRANSCODE['path']}") + if CURRENT_TRANSCODE["start_time"]: + elapsed = (datetime.now() - CURRENT_TRANSCODE["start_time"]).total_seconds() + log.warning(f"Elapsed time: {format_duration(elapsed)}") + + # Try to kill ffmpeg if running + if CURRENT_TRANSCODE["pid"]: + try: + os.kill(CURRENT_TRANSCODE["pid"], signal.SIGTERM) + log.info("Sent SIGTERM to ffmpeg process") + except: + pass + + # Clean up partial output + temp_path = Path(CURRENT_TRANSCODE["path"]).with_suffix(".transcoding.mkv") + if temp_path.exists(): + log.info(f"Removing partial output: {temp_path}") + temp_path.unlink() + + log_json("interrupted", + file=CURRENT_TRANSCODE["path"], + elapsed=elapsed if CURRENT_TRANSCODE["start_time"] else None) + sys.exit(1) + + +# Register signal handlers +signal.signal(signal.SIGINT, handle_interrupt) +signal.signal(signal.SIGTERM, handle_interrupt) + + +def get_video_codec(filepath: str) -> Optional[str]: + """Get the video codec of a file using ffprobe.""" + try: + result = subprocess.run([ + "ffprobe", "-v", "quiet", + "-select_streams", "v:0", + "-show_entries", "stream=codec_name", + "-of", "json", + filepath + ], capture_output=True, text=True, timeout=30) + + if result.returncode == 0: + data = json.loads(result.stdout) + streams = data.get("streams", []) + if streams: + return streams[0].get("codec_name", "").lower() + except Exception as e: + log.warning(f"Failed to probe {filepath}: {e}") + return None + + +def is_hevc(filepath: str) -> bool: + """Check if a file is already HEVC encoded.""" + codec = get_video_codec(filepath) + return codec in ["hevc", "h265"] + + +def get_file_size_gb(filepath: str) -> float: + """Get file size in GB.""" + return os.path.getsize(filepath) / (1024 ** 3) + + +def find_videos_to_transcode(conn: sqlite3.Connection) -> List[Dict]: + """Find all non-HEVC videos, sorted by size (biggest first). 
Uses cache for speed.""" + videos = [] + cache_hits = 0 + cache_misses = 0 + files_scanned = 0 + dirs_scanned = 0 + + scan_start = datetime.now() + log.info("Scanning source directories...") + + for source_dir in CONFIG["source_dirs"]: + if not os.path.exists(source_dir): + log.warning(f" ⚠ Source directory not found: {source_dir}") + continue + + dir_start = datetime.now() + dir_files = 0 + dir_size = 0 + log.info(f" 📁 Scanning: {source_dir}") + + for root, dirs, files in os.walk(source_dir): + # Skip cleanup directory + if ".cleanup" in root: + continue + + dirs_scanned += 1 + + for filename in files: + ext = os.path.splitext(filename)[1].lower() + if ext not in CONFIG["video_extensions"]: + continue + + filepath = os.path.join(root, filename) + files_scanned += 1 + dir_files += 1 + + try: + stat = os.stat(filepath) + size = stat.st_size + mtime = stat.st_mtime + dir_size += size + + # Check cache first + cached = get_cached_file(conn, filepath, size, mtime) + if cached: + video_codec = cached["video_codec"] + audio_codec = cached["audio_codec"] + is_hevc = cached["is_hevc"] + status = cached["status"] + cache_hits += 1 + else: + # Need to probe this file + log.debug(f" Probing: {filename}") + video_codec = get_video_codec(filepath) + audio_codec = get_audio_codec(filepath) + is_hevc = video_codec in ["hevc", "h265"] if video_codec else False + status = "done" if is_hevc else "pending" + cache_file(conn, filepath, size, mtime, + video_codec or "unknown", audio_codec or "unknown", is_hevc) + cache_misses += 1 + + videos.append({ + "path": filepath, + "size": size, + "size_gb": size / (1024 ** 3), + "video_codec": video_codec, + "audio_codec": audio_codec, + "is_hevc": is_hevc, + "status": status + }) + except OSError as e: + log.warning(f" ⚠ Cannot access {filepath}: {e}") + + dir_elapsed = (datetime.now() - dir_start).total_seconds() + log.info(f" Found {dir_files} videos ({format_bytes(dir_size)}) in {format_duration(dir_elapsed)}") + log_json("scan_directory", + directory=source_dir, + files=dir_files, + size=dir_size, + elapsed=dir_elapsed) + + scan_elapsed = (datetime.now() - scan_start).total_seconds() + cache_rate = (cache_hits / (cache_hits + cache_misses) * 100) if (cache_hits + cache_misses) > 0 else 0 + + log.info("") + log.info(f"Scan complete in {format_duration(scan_elapsed)}") + log.info(f" Files scanned: {files_scanned}") + log.info(f" Dirs traversed: {dirs_scanned}") + log.info(f" Cache hits: {cache_hits} ({cache_rate:.1f}%)") + log.info(f" Cache misses: {cache_misses} (probed)") + log.info("") + + log_json("scan_complete", + files=files_scanned, + dirs=dirs_scanned, + cache_hits=cache_hits, + cache_misses=cache_misses, + elapsed=scan_elapsed) + + # Sort by size descending (biggest first) + videos.sort(key=lambda x: x["size"], reverse=True) + return videos + + +def is_past_cutoff() -> bool: + """Check if we're past the cutoff time for starting new encodes.""" + now = datetime.now().time() + return now >= CONFIG["cutoff_time"] + + +def verify_output(filepath: str, min_size_ratio: float = 0.1) -> bool: + """Verify the transcoded file is valid.""" + if not os.path.exists(filepath): + return False + + # Check file size is reasonable (at least 10% of some minimum) + size = os.path.getsize(filepath) + if size < 1024 * 1024: # Less than 1MB is definitely wrong + return False + + # Check it's valid video with ffprobe + codec = get_video_codec(filepath) + if codec != "hevc": + return False + + return True + + +def transcode_file(input_path: str, conn: sqlite3.Connection) -> 
Optional[dict]: + """Transcode a single file to HEVC. Returns result dict on success.""" + + input_file = Path(input_path) + output_path = input_file.with_suffix(".transcoding.mkv") + final_path = input_file.with_suffix(".mkv") + + original_size = os.path.getsize(input_path) + video_duration = get_video_duration(input_path) + + log.info("") + log.info("=" * 60) + log.info(f"TRANSCODE START: {input_file.name}") + log.info("=" * 60) + log.info(f" Input path: {input_path}") + log.info(f" Output path: {output_path}") + log.info(f" Original size: {format_bytes(original_size)}") + if video_duration: + log.info(f" Duration: {format_duration(video_duration)}") + + log_json("transcode_start", + file=input_path, + size=original_size, + duration=video_duration) + + # Mark as transcoding in DB + conn.execute("UPDATE files SET status = 'transcoding' WHERE path = ?", (input_path,)) + conn.commit() + + start_time = datetime.now() + CURRENT_TRANSCODE["path"] = input_path + CURRENT_TRANSCODE["start_time"] = start_time + + settings = { + "crf": CONFIG["crf"], + "preset": CONFIG["preset"], + "audio_codec": CONFIG["audio_codec"], + "audio_bitrate": CONFIG["audio_bitrate"], + "threads": CONFIG["threads"] + } + + # Build ffmpeg command + cmd = [ + "ffmpeg", "-y", + "-threads", str(CONFIG["threads"]), + "-progress", "pipe:1", # Progress to stdout for parsing + "-i", input_path, + "-c:v", "libx265", + "-preset", CONFIG["preset"], + "-crf", str(CONFIG["crf"]), + "-x265-params", f"pools={CONFIG['threads']}", + "-c:a", CONFIG["audio_codec"], + "-b:a", CONFIG["audio_bitrate"], + "-c:s", "copy", # Copy subtitles + str(output_path) + ] + + # Log the full command for debugging + cmd_str = ' '.join(f'"{c}"' if ' ' in c else c for c in cmd) + log.info(f" FFmpeg command:") + log.debug(f" {cmd_str}") + if not VERBOSE: + log.info(f" (use --verbose to see full command)") + log.info("") + log_json("ffmpeg_command", command=cmd_str) + + # Log stderr to file to prevent pipe buffer deadlock + stderr_log_path = Path(CONFIG["log_dir"]) / "transcoder_ffmpeg_stderr.log" + stderr_file = open(stderr_log_path, "a") + stderr_file.write(f"\n{'='*60}\n{datetime.now()} - {input_path}\n{'='*60}\n") + stderr_file.flush() + + try: + # Use Popen for progress monitoring + # stderr goes to file (not PIPE) to prevent buffer deadlock + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=stderr_file, + text=True + ) + CURRENT_TRANSCODE["pid"] = process.pid + log.debug(f" FFmpeg PID: {process.pid}") + + # Monitor progress + last_progress_log = datetime.now() + progress_interval = 60 # Log progress every 60 seconds + current_time_us = 0 + + import select + last_output_size = 0 + last_output_growth = datetime.now() + stall_timeout = 600 # 10 minutes with no output growth = stalled + + while True: + # Use select for non-blocking read with timeout + ready, _, _ = select.select([process.stdout], [], [], 30) + + if ready: + line = process.stdout.readline() + if not line and process.poll() is not None: + break + + # Parse progress output + if line.startswith("out_time_us="): + try: + current_time_us = int(line.split("=")[1]) + except: + pass + elif process.poll() is not None: + break + + # Log progress periodically + now = datetime.now() + if (now - last_progress_log).total_seconds() >= progress_interval: + elapsed = (now - start_time).total_seconds() + + if video_duration and current_time_us > 0: + current_secs = current_time_us / 1_000_000 + percent = (current_secs / video_duration) * 100 + eta_secs = (elapsed / percent * 100) - elapsed 
if percent > 0 else 0 + + log.info(f" ⏳ Progress: {percent:.1f}% | Elapsed: {format_duration(elapsed)} | ETA: {format_duration(eta_secs)}") + log_json("transcode_progress", + file=input_path, + percent=percent, + elapsed=elapsed, + eta=eta_secs) + else: + log.info(f" ⏳ Elapsed: {format_duration(elapsed)}") + + # Check output size growth and detect stalls + if output_path.exists(): + current_size = os.path.getsize(output_path) + log.info(f" Output size: {format_bytes(current_size)}") + + if current_size > last_output_size: + last_output_size = current_size + last_output_growth = now + elif (now - last_output_growth).total_seconds() > stall_timeout: + log.error(f" 🛑 STALL DETECTED: Output file hasn't grown in {stall_timeout}s") + log.error(f" Output stuck at {format_bytes(current_size)}") + log.error(f" Killing ffmpeg (PID {process.pid})") + process.kill() + process.wait(timeout=30) + + # Mark as stalled in DB with timestamp for 7-day cooldown + conn.execute( + "UPDATE files SET status = 'stalled', failure_reason = ?, transcoded_at = ? WHERE path = ?", + (f"Output stalled at {format_bytes(current_size)} after {format_duration(elapsed)}", + datetime.now().timestamp(), input_path) + ) + conn.commit() + + # Clean up partial output + if output_path.exists(): + output_path.unlink() + log.info(f" Cleaned up partial file: {output_path}") + + stderr_file.close() + return None + + last_progress_log = now + + # Wait for process to finish (stderr already going to file) + process.wait(timeout=60) + stderr_file.close() + + if process.returncode != 0: + # Read last 1000 chars of stderr log for error reporting + try: + with open(stderr_log_path, 'r') as f: + f.seek(max(0, os.path.getsize(stderr_log_path) - 1000)) + error_msg = f.read() + except: + error_msg = "Unknown error (check transcoder_ffmpeg_stderr.log)" + log.error("=" * 60) + log.error("TRANSCODE FAILED") + log.error("=" * 60) + log.error(f" File: {input_path}") + log.error(f" Exit code: {process.returncode}") + log.error(f" FFmpeg error output:") + for line in error_msg.split('\n')[-20:]: # Last 20 lines + if line.strip(): + log.error(f" {line}") + log.error("=" * 60) + + log_json("transcode_failed", + file=input_path, + exit_code=process.returncode, + error=error_msg[-500:]) + + if output_path.exists(): + output_path.unlink() + update_transcode_result(conn, input_path, None, 0, 0, settings, False, error_msg[-500:]) + CURRENT_TRANSCODE["path"] = None + return None + + except subprocess.TimeoutExpired: + log.error("=" * 60) + log.error(f"TRANSCODE TIMEOUT (8 hours): {input_path}") + log.error("=" * 60) + log_json("transcode_timeout", file=input_path) + process.kill() + if output_path.exists(): + output_path.unlink() + update_transcode_result(conn, input_path, None, 0, 0, settings, False, "Timeout (8 hours)") + CURRENT_TRANSCODE["path"] = None + return None + except Exception as e: + log.error("=" * 60) + log.error(f"TRANSCODE ERROR: {input_path}") + log.error("=" * 60) + log.error(f" Exception: {type(e).__name__}: {e}") + log.error(f" Traceback:") + for line in traceback.format_exc().split('\n'): + if line.strip(): + log.error(f" {line}") + log.error("=" * 60) + + log_json("transcode_error", + file=input_path, + error=str(e), + traceback=traceback.format_exc()) + + if output_path.exists(): + output_path.unlink() + update_transcode_result(conn, input_path, None, 0, 0, settings, False, str(e)) + CURRENT_TRANSCODE["path"] = None + return None + + # Verify output + log.info(" Verifying output file...") + if not verify_output(str(output_path)): + 
log.error("=" * 60) + log.error("OUTPUT VERIFICATION FAILED") + log.error("=" * 60) + log.error(f" File: {output_path}") + if output_path.exists(): + log.error(f" Output size: {format_bytes(os.path.getsize(output_path))}") + output_codec = get_video_codec(str(output_path)) + log.error(f" Output codec: {output_codec}") + else: + log.error(" Output file does not exist!") + log.error("=" * 60) + + log_json("transcode_verify_failed", file=input_path) + if output_path.exists(): + output_path.unlink() + update_transcode_result(conn, input_path, None, 0, 0, settings, False, "Output verification failed") + CURRENT_TRANSCODE["path"] = None + return None + + new_size = os.path.getsize(output_path) + duration_secs = (datetime.now() - start_time).total_seconds() + space_saved = original_size - new_size + compression_ratio = new_size / original_size if original_size > 0 else 0 + encode_speed = video_duration / duration_secs if video_duration and duration_secs > 0 else 0 + + log.info("") + log.info("=" * 60) + log.info("✅ TRANSCODE COMPLETE") + log.info("=" * 60) + log.info(f" File: {input_file.name}") + log.info(f" Duration: {format_duration(duration_secs)}") + log.info(f" Original: {format_bytes(original_size)}") + log.info(f" New size: {format_bytes(new_size)}") + log.info(f" Saved: {format_bytes(space_saved)} ({100*(1-compression_ratio):.1f}% reduction)") + log.info(f" Ratio: {compression_ratio:.2f}x") + if encode_speed > 0: + log.info(f" Speed: {encode_speed:.2f}x realtime") + log.info("=" * 60) + log.info("") + + log_json("transcode_complete", + file=input_path, + original_size=original_size, + new_size=new_size, + space_saved=space_saved, + duration_secs=duration_secs, + compression_ratio=compression_ratio, + encode_speed=encode_speed) + + # Move original to cleanup directory + cleanup_dir = Path(CONFIG["cleanup_dir"]) + cleanup_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + cleanup_name = f"{timestamp}_{input_file.name}" + cleanup_path = cleanup_dir / cleanup_name + + log.info(f" Moving original to cleanup: {cleanup_path}") + try: + shutil.move(input_path, cleanup_path) + log.info(f" ✓ Original moved successfully") + log_json("original_moved", source=input_path, dest=str(cleanup_path)) + except Exception as e: + log.error(f" ✗ Failed to move original: {e}") + log.error(f" Traceback: {traceback.format_exc()}") + # Don't fail the whole operation, but keep both files + + # Rename output to final name + log.info(f" Renaming output: {output_path} -> {final_path}") + try: + output_path.rename(final_path) + log.info(f" ✓ Output renamed successfully") + except Exception as e: + log.error(f" ✗ Failed to rename output: {e}") + final_path = output_path + + # Fix permissions - mediaserver group + log.debug(" Setting permissions (mediaserver:664)") + try: + subprocess.run(["chgrp", "mediaserver", str(final_path)], check=False) + subprocess.run(["chmod", "664", str(final_path)], check=False) + log.debug(" ✓ Permissions set") + except Exception as e: + log.warning(f" ✗ Failed to set permissions: {e}") + + # Update database with success + update_transcode_result(conn, input_path, str(final_path), new_size, duration_secs, settings, True) + + CURRENT_TRANSCODE["path"] = None + return { + "output_path": str(final_path), + "new_size": new_size, + "duration_secs": duration_secs, + "space_saved": space_saved + } + + +def cleanup_old_files(): + """Delete original files older than cleanup_days.""" + cleanup_dir = Path(CONFIG["cleanup_dir"]) + if not 
cleanup_dir.exists(): + log.debug("Cleanup directory doesn't exist, skipping cleanup") + return + + log.info("Checking cleanup directory for old originals...") + cutoff = datetime.now().timestamp() - (CONFIG["cleanup_days"] * 24 * 3600) + cutoff_date = datetime.fromtimestamp(cutoff).strftime('%Y-%m-%d %H:%M') + log.debug(f" Cutoff date: {cutoff_date} ({CONFIG['cleanup_days']} days ago)") + + deleted_count = 0 + deleted_size = 0 + kept_count = 0 + kept_size = 0 + + for filepath in cleanup_dir.iterdir(): + if filepath.is_file(): + mtime = filepath.stat().st_mtime + size = filepath.stat().st_size + + if mtime < cutoff: + try: + filepath.unlink() + deleted_count += 1 + deleted_size += size + log.info(f" 🗑️ Deleted: {filepath.name} ({format_bytes(size)})") + log_json("cleanup_deleted", file=str(filepath), size=size) + except Exception as e: + log.warning(f" ⚠ Failed to delete {filepath}: {e}") + else: + kept_count += 1 + kept_size += size + days_old = (datetime.now().timestamp() - mtime) / 86400 + log.debug(f" ⏳ Keeping: {filepath.name} ({days_old:.1f} days old)") + + if deleted_count > 0: + log.info(f" Cleanup: deleted {deleted_count} files, freed {format_bytes(deleted_size)}") + log_json("cleanup_complete", deleted=deleted_count, freed=deleted_size) + else: + log.info(f" Cleanup: nothing to delete ({kept_count} files still in retention)") + + if kept_count > 0: + log.debug(f" Retention: {kept_count} files ({format_bytes(kept_size)}) waiting)") + + +def main(): + # Initialize database + conn = init_db() + + # Handle --stats flag + if args.stats: + stats = get_cache_stats(conn) + print("=" * 60) + print("Transcoder Statistics") + print("=" * 60) + print(f"Total files scanned: {stats['total']}") + print(f"Already HEVC: {stats['hevc_count']} ({stats['hevc_size']/(1024**3):.2f} GB)") + print(f"Pending transcode: {stats['pending_count']} ({stats['pending_size']/(1024**3):.2f} GB)") + print(f"Completed: {stats['done_count']}") + print(f"Failed: {stats['failed_count']}") + print("") + print("--- Lifetime Stats ---") + print(f"Total transcoded: {stats.get('total_transcoded', 0)} files") + print(f"Total space saved: {stats.get('total_space_saved', 0)/(1024**3):.2f} GB") + total_time = stats.get('total_transcode_time', 0) + print(f"Total transcode time: {total_time/3600:.1f} hours") + if stats['pending_count'] > 0: + est_savings = stats['pending_size'] * 0.5 + print(f"\nEstimated remaining: ~{est_savings/(1024**3):.2f} GB savings") + + if stats.get('by_directory'): + print("\n--- Top Directories (by pending size) ---") + for dir_path, count, pending in stats['by_directory'][:5]: + if pending > 0: + # Shorten path for display + short = dir_path.replace("/disks/Plex/", "") + print(f" {short}: {pending/(1024**3):.2f} GB pending") + + print("=" * 60) + conn.close() + return + + # Handle --clear-cache flag + if args.clear_cache: + conn.execute("DELETE FROM files") + conn.execute("UPDATE stats SET total_files_transcoded=0, total_space_saved=0, total_transcode_time_secs=0") + conn.commit() + print("Cache cleared.") + conn.close() + return + + # Handle --failed flag + if args.failed: + cursor = conn.execute(""" + SELECT path, original_size, failure_reason + FROM files WHERE status = 'failed' + ORDER BY original_size DESC + """) + rows = cursor.fetchall() + print("=" * 60) + print(f"Failed Transcodes ({len(rows)} files)") + print("=" * 60) + for path, size, reason in rows: + print(f"\n{path}") + print(f" Size: {size/(1024**3):.2f} GB") + print(f" Reason: {reason}") + if not rows: + print("No failed transcodes!") 
+ print("=" * 60) + conn.close() + return + + # Handle --retry-failed flag + if args.retry_failed: + cursor = conn.execute("UPDATE files SET status = 'pending', failure_reason = NULL WHERE status = 'failed'") + count = cursor.rowcount + conn.commit() + print(f"Reset {count} failed files to pending.") + conn.close() + return + + session_start = datetime.now() + + log.info("") + log.info("=" * 70) + log.info(" OVERNIGHT VIDEO TRANSCODER") + log.info("=" * 70) + log.info(f" Started: {session_start.strftime('%Y-%m-%d %H:%M:%S')}") + log.info(f" Mode: {'DRY RUN' if DRY_RUN else 'LIVE'}") + log.info(f" Verbose: {'YES' if VERBOSE else 'NO'}") + log.info(f" JSON log: {'YES' if JSON_LOG else 'NO'}") + log.info(f" Cutoff: {CONFIG['cutoff_time']}") + log.info(f" Settings: CRF={CONFIG['crf']}, preset={CONFIG['preset']}, audio={CONFIG['audio_codec']}@{CONFIG['audio_bitrate']}") + log.info("=" * 70) + log.info("") + + log_json("session_start", + mode="dry_run" if DRY_RUN else "live", + config=CONFIG) + + # Check disk space first + check_disk_space() + + # Run cleanup first (skip in dry run) + if not DRY_RUN: + cleanup_old_files() + + # Find videos to transcode + videos = find_videos_to_transcode(conn) + + # Filter to non-HEVC pending only (using cached codec info) + to_transcode = [] + hevc_count = 0 + hevc_size = 0 + + for video in videos: + if video.get("is_hevc", False) or video.get("status") == "done": + hevc_count += 1 + hevc_size += video["size"] + elif video.get("status") == "pending": + to_transcode.append(video) + + log.info(f"Already HEVC: {hevc_count} files ({hevc_size / (1024**3):.2f} GB)") + log.info(f"Need transcoding: {len(to_transcode)} files") + + if DRY_RUN: + log.info("") + log.info("=" * 60) + log.info("DRY RUN - Files that would be transcoded (biggest first):") + log.info("=" * 60) + + total_size = 0 + estimated_savings = 0 + + for i, video in enumerate(to_transcode[:20], 1): # Show top 20 + size_gb = video["size_gb"] + total_size += video["size"] + # Estimate ~50% compression for HEVC + est_saved = video["size"] * 0.5 + estimated_savings += est_saved + + log.info(f"{i:2}. [{size_gb:6.2f} GB] {video['path']}") + + if len(to_transcode) > 20: + log.info(f" ... 
and {len(to_transcode) - 20} more files") + + log.info("") + log.info("=" * 60) + log.info(f"Total to transcode: {total_size / (1024**3):.2f} GB") + log.info(f"Estimated savings (~50%): {estimated_savings / (1024**3):.2f} GB") + log.info("=" * 60) + return + + # Normal run + transcoded_count = 0 + total_saved = 0 + failed_count = 0 + skipped_count = 0 + + log.info("") + log.info("=" * 60) + log.info("Starting transcode queue") + log.info("=" * 60) + log.info(f" Files to process: {len(to_transcode)}") + log.info(f" Total size: {format_bytes(sum(v['size'] for v in to_transcode))}") + log.info("") + + for i, video in enumerate(to_transcode, 1): + # Check cutoff time + if is_past_cutoff(): + remaining = len(to_transcode) - i + 1 + log.info("") + log.info("=" * 60) + log.info(f"⏰ CUTOFF TIME REACHED ({CONFIG['cutoff_time']})") + log.info(f" Stopping with {remaining} files remaining in queue") + log.info("=" * 60) + log_json("cutoff_reached", remaining_files=remaining) + break + + # Skip already done or failed + if video.get("status") in ["done", "failed"]: + log.debug(f" Skipping (status={video['status']}): {video['path']}") + skipped_count += 1 + continue + + # Skip stalled files unless 7 days have passed + if video.get("status") == "stalled": + row = conn.execute( + "SELECT transcoded_at FROM files WHERE path = ?", (video['path'],) + ).fetchone() + if row and row[0]: + stalled_at = datetime.fromtimestamp(row[0]) + days_since = (datetime.now() - stalled_at).days + if days_since < 7: + log.debug(f" Skipping stalled file ({days_since}d ago, retry after 7d): {video['path']}") + skipped_count += 1 + continue + else: + log.info(f" Retrying stalled file ({days_since}d cooldown passed): {video['path']}") + else: + log.debug(f" Skipping stalled file (no timestamp): {video['path']}") + skipped_count += 1 + continue + + log.info(f"[{i}/{len(to_transcode)}] Queued: {Path(video['path']).name}") + log.info(f" Size: {format_bytes(video['size'])}") + log.info(f" Codec: video={video.get('video_codec', 'unknown')}, audio={video.get('audio_codec', 'unknown')}") + log.info(f" Path: {video['path']}") + + log_json("transcode_queued", + index=i, + total=len(to_transcode), + file=video['path'], + size=video['size'], + video_codec=video.get('video_codec'), + audio_codec=video.get('audio_codec')) + + # Transcode + result = transcode_file(video["path"], conn) + + if result: + total_saved += result["space_saved"] + transcoded_count += 1 + log.info(f" Running totals: {transcoded_count} done, {format_bytes(total_saved)} saved") + else: + failed_count += 1 + log.error(f" ❌ Transcode #{i} failed: {video['path']}") + log.info(f" Running totals: {transcoded_count} done, {failed_count} failed") + + session_end = datetime.now() + session_duration = (session_end - session_start).total_seconds() + + log.info("") + log.info("=" * 70) + log.info(" SESSION COMPLETE") + log.info("=" * 70) + log.info(f" Started: {session_start.strftime('%Y-%m-%d %H:%M:%S')}") + log.info(f" Ended: {session_end.strftime('%Y-%m-%d %H:%M:%S')}") + log.info(f" Duration: {format_duration(session_duration)}") + log.info(f" Transcoded: {transcoded_count} files") + log.info(f" Failed: {failed_count} files") + log.info(f" Skipped: {skipped_count} files") + log.info(f" Space saved: {format_bytes(total_saved)}") + + # Get updated lifetime stats + stats = get_cache_stats(conn) + log.info("") + log.info(" --- Lifetime Totals ---") + log.info(f" Total transcoded: {stats.get('total_transcoded', 0)} files") + log.info(f" Total saved: 
{format_bytes(stats.get('total_space_saved', 0))}") + log.info(f" Pending: {stats.get('pending_count', 0)} files ({format_bytes(stats.get('pending_size', 0))})") + log.info("=" * 70) + log.info("") + + log_json("session_complete", + started=session_start.isoformat(), + ended=session_end.isoformat(), + duration_secs=session_duration, + transcoded=transcoded_count, + space_saved=total_saved, + lifetime_transcoded=stats.get('total_transcoded', 0), + lifetime_saved=stats.get('total_space_saved', 0), + pending_count=stats.get('pending_count', 0)) + + conn.close() + + +if __name__ == "__main__": + main() diff --git a/scrape_discogs_labels.py b/scrape_discogs_labels.py new file mode 100644 index 0000000..7f6c6ff --- /dev/null +++ b/scrape_discogs_labels.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +"""Scrape releases from Discogs labels and save to JSON for the music pipeline. + +Reads label URLs from Tom's bookmarks (Discogs folder) and scrapes releases. +""" + +import json +import re +import subprocess +import sys +import time +from pathlib import Path + +import requests + +HEADERS = { + "User-Agent": "MusicRecommender/1.0 +https://github.com/openclaw" +} + +SCRIPTS_DIR = Path(__file__).parent + + +def get_labels_from_bookmarks() -> list[tuple[int, str]]: + """Parse bookmarks and extract Discogs label IDs and names. + + Only looks in Music > Discog Labels folder. + """ + labels = [] + + # Run the decrypt script + try: + result = subprocess.run( + ["node", str(SCRIPTS_DIR / "decrypt_bookmarks.js")], + capture_output=True, + text=True, + timeout=30 + ) + bookmarks_xml = result.stdout + except Exception as e: + print(f"Error reading bookmarks: {e}", file=sys.stderr) + return labels + + # Find the "Discog Labels" folder section + # Look for <folder...><title>Discog Labels</title>...</folder> + folder_pattern = r'<folder[^>]*>\s*<title>Discog Labels</title>(.*?)</folder>' + folder_match = re.search(folder_pattern, bookmarks_xml, re.DOTALL | re.IGNORECASE) + + if not folder_match: + print("Could not find 'Discog Labels' folder in bookmarks", file=sys.stderr) + return labels + + folder_content = folder_match.group(1) + + # Find Discogs label URLs within that folder + # Pattern: https://www.discogs.com/label/6458-Indochina + pattern = r'href="https://www\.discogs\.com/label/(\d+)-([^"?]+)' + + for match in re.finditer(pattern, folder_content): + label_id = int(match.group(1)) + label_name = match.group(2).replace("-", " ") + labels.append((label_id, label_name)) + + return labels + +def get_label_releases(label_id: int, label_name: str, max_pages: int = 5) -> list[dict]: + """Fetch releases from a Discogs label.""" + releases = [] + + for page in range(1, max_pages + 1): + url = f"https://api.discogs.com/labels/{label_id}/releases?page={page}&per_page=100" + print(f" Fetching {label_name} page {page}...", file=sys.stderr) + + try: + resp = requests.get(url, headers=HEADERS, timeout=30) + resp.raise_for_status() + data = resp.json() + except Exception as e: + print(f" Error: {e}", file=sys.stderr) + break + + for r in data.get("releases", []): + # Skip compilations, singles, etc - focus on albums + if r.get("format") and "Album" not in str(r.get("format", "")): + # Still include if no format specified + pass + + artist = r.get("artist", "Various") + title = r.get("title", "") + year = r.get("year", "") + + # Clean up artist name + artist = re.sub(r'\s*\(\d+\)$', '', artist) # Remove disambiguation numbers + + if artist and title and artist.lower() != "various": + releases.append({ + "artist": artist, + 
"album": title, + "year": year, + "label": label_name, + "discogs_id": r.get("id"), + }) + + # Check if more pages + if page >= data.get("pagination", {}).get("pages", 1): + break + + time.sleep(1) # Rate limit + + return releases + +def get_labels_from_config() -> list[tuple[int, str]]: + """Get Discogs labels from music_config.json.""" + labels = [] + config_path = SCRIPTS_DIR / "music_config.json" + + if not config_path.exists(): + return labels + + try: + with open(config_path) as f: + config = json.load(f) + + for entry in config.get("discogs_labels", []): + url = entry.get("url", "") + name = entry.get("name", "") + # Extract ID from URL like https://www.discogs.com/label/6170-Tempa + match = re.search(r'/label/(\d+)', url) + if match and name: + labels.append((int(match.group(1)), name)) + except Exception as e: + print(f"Error reading config: {e}", file=sys.stderr) + + return labels + + +def main(): + # Get labels from bookmarks and config + labels = get_labels_from_bookmarks() + config_labels = get_labels_from_config() + + # Merge, avoiding duplicates by ID + seen_ids = {lid for lid, _ in labels} + for lid, lname in config_labels: + if lid not in seen_ids: + labels.append((lid, lname)) + seen_ids.add(lid) + + if not labels: + print("No Discogs labels found in bookmarks or config!", file=sys.stderr) + sys.exit(1) + + print(f"Found {len(labels)} labels in bookmarks:", file=sys.stderr) + for lid, lname in labels: + print(f" - {lname} ({lid})", file=sys.stderr) + + all_releases = [] + + for label_id, label_name in labels: + print(f"Scraping {label_name}...", file=sys.stderr) + releases = get_label_releases(label_id, label_name) + all_releases.extend(releases) + print(f" Found {len(releases)} releases", file=sys.stderr) + time.sleep(2) # Be nice to Discogs + + # Dedupe by artist+album + seen = set() + unique = [] + for r in all_releases: + key = f"{r['artist'].lower()}|{r['album'].lower()}" + if key not in seen: + seen.add(key) + unique.append(r) + + output = { + "labels": [{"id": lid, "name": lname} for lid, lname in labels], + "releases": unique, + "scraped_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + + print(json.dumps(output, indent=2)) + +if __name__ == "__main__": + main() diff --git a/transcode_album.sh b/transcode_album.sh new file mode 100755 index 0000000..d1ad1aa --- /dev/null +++ b/transcode_album.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# Transcode FLAC album to Opus, preserving metadata +# Usage: ./transcode_album.sh <source_dir> [--delete-source] +# +# Converts all FLAC files to Opus 128kbps (transparent quality, ~10x smaller) +# Preserves all metadata tags and album art + +set -e + +SOURCE_DIR="$1" +DELETE_SOURCE=false +BITRATE="128k" +GROUP="mediaserver" + +if [[ "$2" == "--delete-source" ]]; then + DELETE_SOURCE=true +fi + +if [[ -z "$SOURCE_DIR" ]] || [[ ! -d "$SOURCE_DIR" ]]; then + echo "Usage: $0 <source_directory> [--delete-source]" + exit 1 +fi + +# Count FLAC files +flac_count=$(find "$SOURCE_DIR" -maxdepth 1 -iname "*.flac" | wc -l) +if [[ $flac_count -eq 0 ]]; then + echo "No FLAC files found in $SOURCE_DIR" + exit 0 +fi + +echo "Transcoding $flac_count FLAC files to Opus ($BITRATE)..." 
+echo "Source: $SOURCE_DIR" + +converted=0 +failed=0 + +# Process each FLAC file +find "$SOURCE_DIR" -maxdepth 1 -iname "*.flac" | while read -r flac_file; do + filename=$(basename "$flac_file") + opus_file="${flac_file%.flac}.opus" + opus_file="${opus_file%.FLAC}.opus" + + echo " Converting: $filename" + + if ffmpeg -hide_banner -loglevel warning -i "$flac_file" \ + -c:a libopus -b:a "$BITRATE" -vbr on \ + -map_metadata 0 \ + -y "$opus_file" 2>&1; then + + # Preserve original timestamp + touch -r "$flac_file" "$opus_file" + + # Fix permissions + chgrp "$GROUP" "$opus_file" 2>/dev/null || true + chmod 664 "$opus_file" 2>/dev/null || true + + if $DELETE_SOURCE; then + rm "$flac_file" + echo " ✓ Converted and removed FLAC" + else + echo " ✓ Converted (FLAC kept)" + fi + ((converted++)) || true + else + echo " ✗ Failed to convert" + ((failed++)) || true + fi +done + +# Also convert any MP3s that are suspiciously large (>15MB per file avg) +# Actually, skip this - MP3s are already compressed + +# Handle cover art - keep it +# ffmpeg should copy embedded art automatically for opus + +echo "" +echo "Done! Converted $converted files." +if $DELETE_SOURCE; then + # Calculate space saved + echo "FLAC files deleted to save space." +fi |
