#!/usr/bin/env python3 """ Web Archive (Wayback Machine) Lookup Tool ========================================== Find archived snapshots of websites on archive.org Features: - Check if domain/URL was archived - Get all available snapshots - Download specific snapshots - Bulk check domains - Save archived pages Usage: python webarchive_lookup.py check python webarchive_lookup.py snapshots [--year 2023] python webarchive_lookup.py download [--timestamp 20230115] python webarchive_lookup.py bulk --output archived.json """ import os import sys import json import time import argparse import requests from datetime import datetime from pathlib import Path from urllib.parse import urlparse, quote BASE_URL = 'https://archive.org/wayback/available' CDX_URL = 'https://web.archive.org/cdx/search/cdx' def log(msg: str, level: str = 'info') -> None: """Print colored log message""" colors = { 'info': '\033[94m[*]\033[0m', 'success': '\033[92m[+]\033[0m', 'warning': '\033[93m[!]\033[0m', 'error': '\033[91m[-]\033[0m' } print(f"{colors.get(level, '[*]')} {msg}") def normalize_url(url: str) -> str: """Ensure URL has scheme""" if not url.startswith(('http://', 'https://')): url = 'https://' + url return url def check_availability(url: str) -> dict | None: """Check if URL has been archived""" url = normalize_url(url) try: response = requests.get( BASE_URL, params={'url': url}, timeout=30 ) if response.status_code == 200: data = response.json() snapshot = data.get('archived_snapshots', {}).get('closest') if snapshot: return { 'available': True, 'url': snapshot.get('url'), 'timestamp': snapshot.get('timestamp'), 'status': snapshot.get('status') } return {'available': False} else: log(f"API error: {response.status_code}", 'error') return None except Exception as e: log(f"Request error: {e}", 'error') return None def get_snapshots(url: str, year: str = None, limit: int = 100) -> list: """Get all available snapshots for a URL""" url = normalize_url(url) params = { 'url': url, 'output': 'json', 'limit': limit, 'fl': 'timestamp,original,statuscode,mimetype,length' } if year: params['from'] = f'{year}0101' params['to'] = f'{year}1231' try: response = requests.get(CDX_URL, params=params, timeout=60) if response.status_code == 200: lines = response.text.strip().split('\n') if len(lines) <= 1: return [] # First line is header results = [] for line in lines[1:]: try: data = json.loads(line) if line.startswith('[') else line.split() if len(data) >= 5: results.append({ 'timestamp': data[0], 'original': data[1], 'status': data[2], 'mimetype': data[3], 'length': data[4] if len(data) > 4 else '0', 'archive_url': f"https://web.archive.org/web/{data[0]}/{data[1]}" }) except: continue return results else: log(f"CDX API error: {response.status_code}", 'error') return [] except Exception as e: log(f"Request error: {e}", 'error') return [] def get_snapshot_content(url: str, timestamp: str = None) -> str | None: """Download content from a specific snapshot""" url = normalize_url(url) if timestamp: archive_url = f"https://web.archive.org/web/{timestamp}id_/{url}" else: archive_url = f"https://web.archive.org/web/{url}" try: response = requests.get(archive_url, timeout=60) if response.status_code == 200: return response.text else: log(f"Download failed: {response.status_code}", 'error') return None except Exception as e: log(f"Download error: {e}", 'error') return None def bulk_check(urls: list, output_file: str = None) -> dict: """Check archive availability for multiple URLs""" results = {} total = len(urls) for i, url in enumerate(urls, 1): url = url.strip() if not url: continue log(f"[{i}/{total}] Checking: {url}") data = check_availability(url) if data and data.get('available'): results[url] = { 'archived': True, 'archive_url': data.get('url'), 'timestamp': data.get('timestamp'), 'date': datetime.strptime(data['timestamp'], '%Y%m%d%H%M%S').strftime('%Y-%m-%d %H:%M:%S') if data.get('timestamp') else None } log(f"Archived on {results[url]['date']}", 'success') else: results[url] = {'archived': False} log("Not archived", 'warning') time.sleep(1) # Be nice to the API # Save results if output_file: with open(output_file, 'w', encoding='utf-8') as f: json.dump(results, f, indent=2, ensure_ascii=False) log(f"Results saved to: {output_file}", 'success') archived_count = sum(1 for r in results.values() if r.get('archived')) log(f"\nSummary: {archived_count}/{len(results)} URLs archived", 'success') return results def save_snapshot(url: str, timestamp: str = None, output_dir: str = '.') -> bool: """Save a snapshot to file""" content = get_snapshot_content(url, timestamp) if not content: return False # Create filename from URL parsed = urlparse(normalize_url(url)) domain = parsed.netloc.replace('.', '_') ts = timestamp or 'latest' filename = f"{domain}_{ts}.html" filepath = Path(output_dir) / filename with open(filepath, 'w', encoding='utf-8') as f: f.write(content) log(f"Saved to: {filepath}", 'success') return True def main(): parser = argparse.ArgumentParser( description='Web Archive (Wayback Machine) Lookup Tool', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python webarchive_lookup.py check example.com python webarchive_lookup.py snapshots phishing-site.com --year 2023 python webarchive_lookup.py download scam-site.com --timestamp 20230115120000 python webarchive_lookup.py bulk urls.txt --output archived.json """ ) subparsers = parser.add_subparsers(dest='command', help='Commands') # Check command check_parser = subparsers.add_parser('check', help='Check if URL is archived') check_parser.add_argument('url', help='URL to check') # Snapshots command snap_parser = subparsers.add_parser('snapshots', help='Get all snapshots') snap_parser.add_argument('url', help='URL to lookup') snap_parser.add_argument('--year', '-y', help='Filter by year (e.g., 2023)') snap_parser.add_argument('--limit', '-l', type=int, default=50, help='Max results') # Download command dl_parser = subparsers.add_parser('download', help='Download archived page') dl_parser.add_argument('url', help='URL to download') dl_parser.add_argument('--timestamp', '-t', help='Specific timestamp (YYYYMMDDhhmmss)') dl_parser.add_argument('--output', '-o', default='.', help='Output directory') # Bulk command bulk_parser = subparsers.add_parser('bulk', help='Check multiple URLs') bulk_parser.add_argument('file', help='File with URLs (one per line)') bulk_parser.add_argument('--output', '-o', default='archive_results.json', help='Output file') args = parser.parse_args() if not args.command: parser.print_help() sys.exit(1) if args.command == 'check': data = check_availability(args.url) if data: if data.get('available'): log(f"URL is archived!", 'success') print(f" Archive URL: {data.get('url')}") print(f" Timestamp: {data.get('timestamp')}") if data.get('timestamp'): dt = datetime.strptime(data['timestamp'], '%Y%m%d%H%M%S') print(f" Date: {dt.strftime('%Y-%m-%d %H:%M:%S')}") else: log("URL not found in archive", 'warning') elif args.command == 'snapshots': snapshots = get_snapshots(args.url, args.year, args.limit) if snapshots: log(f"Found {len(snapshots)} snapshots:", 'success') for snap in snapshots[:20]: ts = snap.get('timestamp', '') if ts: dt = datetime.strptime(ts, '%Y%m%d%H%M%S').strftime('%Y-%m-%d') else: dt = 'Unknown' print(f" - {dt}: {snap.get('archive_url')}") if len(snapshots) > 20: print(f" ... and {len(snapshots) - 20} more") else: log("No snapshots found", 'warning') elif args.command == 'download': log(f"Downloading snapshot for: {args.url}") save_snapshot(args.url, args.timestamp, args.output) elif args.command == 'bulk': if not Path(args.file).exists(): log(f"File not found: {args.file}", 'error') sys.exit(1) with open(args.file, 'r') as f: urls = [line.strip() for line in f if line.strip()] log(f"Loaded {len(urls)} URLs", 'info') bulk_check(urls, args.output) if __name__ == '__main__': main()