""" Download all audio recordings from S3 to a local directory. Usage: python scripts/download_recordings.py python scripts/download_recordings.py --prefix benapi92 # filter by session code python scripts/download_recordings.py --out data/raw """ import argparse import os from pathlib import Path import boto3 from botocore.config import Config as BotoConfig from dotenv import load_dotenv load_dotenv() BUCKET = 'ethz-otree-whisper' REGION = 'eu-north-1' DEFAULT_OUT = Path('data/raw') def download(prefix: str = '', out_dir: Path = DEFAULT_OUT): s3 = boto3.client( 's3', aws_access_key_id=os.environ['S3_ACCESS_KEY'], aws_secret_access_key=os.environ['S3_SECRET_KEY'], region_name=REGION, config=BotoConfig(signature_version='s3v4'), ) out_dir.mkdir(parents=True, exist_ok=True) paginator = s3.get_paginator('list_objects_v2') pages = paginator.paginate(Bucket=BUCKET, Prefix=prefix) total = 0 for page in pages: for obj in page.get('Contents', []): key = obj['Key'] dest = out_dir / key dest.parent.mkdir(parents=True, exist_ok=True) if dest.exists(): print(f' skip {key} (already exists)') continue print(f' down {key}') s3.download_file(BUCKET, key, str(dest)) total += 1 print(f'\nDone. {total} file(s) downloaded to {out_dir}/') if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--prefix', default='', help='Filter by key prefix (e.g. session code)') parser.add_argument('--out', default=str(DEFAULT_OUT), help='Local output directory') args = parser.parse_args() download(prefix=args.prefix, out_dir=Path(args.out))