import argparse
import urllib.parse
import requests
import threading
import time
from datetime import datetime, timedelta, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- CONFIGURATION ---
WORKERS = 15      # concurrent threads used for the Eyecast ID scan
CHUNK_SIZE = 100  # Eyecast IDs probed per batch
MAX_CONSECUTIVE_DEAD_CHUNKS = 5  # 5 chunks of 100 = 500 consecutive dead IDs before stopping

thread_local = threading.local()

def get_session():
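    """Return a per-thread requests.Session so each worker reuses connections without sharing state."""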
    if not hasattr(thread_local, "session"):
        thread_local.session = requests.Session()
    return thread_local.session

def sanitize_url(original_url):
    """Safely encodes spaces and normalizes http/https for perfect deduplication."""
    parsed = urllib.parse.urlparse(original_url)
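    # Decode any existing percent-escapes first so already-encoded paths are not double-encoded.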
    raw_path = urllib.parse.unquote(parsed.path)
    clean_path = urllib.parse.quote(raw_path, safe="/")
    
    return urllib.parse.urlunparse((
        'http',                  # force a single scheme so http/https variants deduplicate
        parsed.netloc.lower(),   # hostnames are case-insensitive
        clean_path,
        parsed.params,
        parsed.query,
        parsed.fragment
    ))

def generate_predictables(start_date_str):
    """Generates Hourly, Update, and Licensedhnc URLs from start_date to tomorrow."""
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()
    # Use tomorrow's UTC date so the range never misses the newest hours, regardless of local timezone
    end_date = datetime.now(timezone.utc).date() + timedelta(days=1)
    
    urls = []
    curr = start_date
    while curr <= end_date:
        y, m, d = curr.strftime("%Y"), curr.strftime("%m"), curr.strftime("%d")
        for h in range(24):
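            # Three feed variants are published each hour: Hourly, Update, and Licensedhnc.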
            hh = f"{h:02d}"
            urls.append(f"http://audio.cbsradionewsfeed.com/{y}/{m}/{d}/{hh}/Hourly-{hh}.mp3")
            urls.append(f"http://audio.cbsradionewsfeed.com/{y}/{m}/{d}/{hh}/Update-{hh}.mp3")
            urls.append(f"http://audio.cbsradionewsfeed.com/{y}/{m}/{d}/{hh}/Licensedhnc-{hh}.mp3")
        curr += timedelta(days=1)
        
    return urls, end_date.strftime("%Y-%m-%d")

def check_eyecast_id(eid, retries=3):
    """Checks a single Eyecast ID and returns the destination URL if valid."""
    url = f"https://www.cbsradionewsfeed.com/eyecast/distribution/{eid}.mp3"
    session = get_session()
    
    for attempt in range(retries):
        try:
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
            response = session.head(url, headers=headers, allow_redirects=False, timeout=10)
            
            if response.status_code == 429:
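                # Rate limited: back off exponentially (1s, 2s, 4s) before retrying.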
                time.sleep(2 ** attempt)
                continue
                
            if response.status_code in [301, 302]:
                target_url = response.headers.get('Location', '')
                if target_url.lower().endswith('.mp3'):
                    return eid, target_url
            return eid, None
            
        except requests.RequestException:
            time.sleep(1)
            continue
            
    return eid, None

def main():
    parser = argparse.ArgumentParser(description="CBS Radio News Archive Updater")
    parser.add_argument("--date", required=True, help="Start date (YYYY-MM-DD)")
    parser.add_argument("--id", type=int, required=True, help="Start Eyecast ID (e.g., 3731144)")
    args = parser.parse_args()

    print(f"🚀 CBS Archive Updater Initialized")
    print(f"📅 Generating predictables from {args.date} to Tomorrow...")
    
    # 1. Generate Predictables
    predictable_urls, end_date_str = generate_predictables(args.date)
    print(f"  -> Generated {len(predictable_urls):,} predictable URLs.")

    # 2. Unroll Eyecast Forward
    print(f"\n🔍 Scanning Eyecast IDs forward from {args.id}...")
    print(f"   (Will automatically stop after {MAX_CONSECUTIVE_DEAD_CHUNKS * CHUNK_SIZE} consecutive empty IDs)")
    
    eyecast_urls = []
    current_start_id = args.id
    highest_valid_id = args.id
    dead_chunks = 0
    
    with ThreadPoolExecutor(max_workers=WORKERS) as executor:
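        # Scan forward in fixed-size chunks; stop once MAX_CONSECUTIVE_DEAD_CHUNKS windows in a row come back empty.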
        while dead_chunks < MAX_CONSECUTIVE_DEAD_CHUNKS:
            chunk_end = current_start_id + CHUNK_SIZE
            futures = [executor.submit(check_eyecast_id, eid) for eid in range(current_start_id, chunk_end)]
            
            chunk_found = 0
            for future in as_completed(futures):
                eid, target = future.result()
                if target:
                    eyecast_urls.append(target)
                    chunk_found += 1
                    if eid > highest_valid_id:
                        highest_valid_id = eid
            
            if chunk_found == 0:
                dead_chunks += 1
                print(f"  ⚠️ Chunk {current_start_id}-{chunk_end-1} empty. (Strike {dead_chunks}/{MAX_CONSECUTIVE_DEAD_CHUNKS})")
            else:
                dead_chunks = 0 # Reset the dead chunk counter if we find something!
                print(f"  ✅ Chunk {current_start_id}-{chunk_end-1} | Found: {chunk_found}")
                
            current_start_id = chunk_end

    print(f"🛑 Reached the end of the published database at ID ~{highest_valid_id}.")
    print(f"  -> Found {len(eyecast_urls):,} new Eyecast URLs.")

    # 3. Sanitize and Deduplicate
    print("\n🧹 Sanitizing and deduplicating all URLs...")
    final_urls = set()
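    # A set collapses any overlap between the predictable URLs and the Eyecast redirect targets.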
    
    for url in predictable_urls + eyecast_urls:
        final_urls.add(sanitize_url(url))
        
    sorted_urls = sorted(final_urls)

    # 4. Save to dynamically named file
    filename = f"cbs_update_{args.date.replace('-','')}_to_{end_date_str.replace('-','')}_Eyecast_{args.id}_to_{highest_valid_id}.txt"
    
    with open(filename, "w", encoding="utf-8") as f:
        for url in sorted_urls:
            f.write(f"{url}\n")

    print(f"\n🎉 Update Complete!")
    print(f"💾 Saved {len(sorted_urls):,} perfectly formatted URLs to '{filename}'")
    print(f"▶️  Run: wget -i {filename} -x -nc -c -t 2 --waitretry=1 --random-wait --restrict-file-names=nocontrol")

if __name__ == "__main__":
    main()
