import asyncio
import csv
import json
import re
from collections import OrderedDict
from io import StringIO

import aiohttp

# IANA publishes one CSV registry per top-level media type.
URLS = OrderedDict([
    ('text', 'https://www.iana.org/assignments/media-types/text.csv'),
    ('image', 'https://www.iana.org/assignments/media-types/image.csv'),
    ('audio', 'https://www.iana.org/assignments/media-types/audio.csv'),
    ('video', 'https://www.iana.org/assignments/media-types/video.csv'),
    ('application', 'https://www.iana.org/assignments/media-types/application.csv'),
    ('font', 'https://www.iana.org/assignments/media-types/font.csv'),
    ('model', 'https://www.iana.org/assignments/media-types/model.csv'),
    ('multipart', 'https://www.iana.org/assignments/media-types/multipart.csv'),
    ('message', 'https://www.iana.org/assignments/media-types/message.csv'),
    ('haptics', 'https://www.iana.org/assignments/media-types/haptics.csv'),
])

OUTPUT_JSON_FILE = "mime_types.json"
REQUEST_TIMEOUT_SECONDS = 20
USER_AGENT = "MimeTypeFetcher/1.0.0"

# Matches "(obsolete)"/"(obsoleted ...)"/"(deprecated ...)" and the bare
# words "obsolete"/"obsoleted"/"deprecated", case-insensitively.
OBSOLETE_DEPRECATED_PATTERN = re.compile(
    r"\(obsoleted?[^)]*\)|"
    r"\bobsoleted?\b|"
    r"\(deprecated[^)]*\)|"
    r"\bdeprecated\b",
    re.IGNORECASE,
)


def is_obsolete_or_deprecated(name_str):
    """Return True if a registry entry name is flagged obsolete or deprecated."""
    return bool(OBSOLETE_DEPRECATED_PATTERN.search(name_str))


async def fetch_category_data(session, category_key, url):
    """Download one category CSV and parse it into a list of row dicts."""
    print(f"Fetching: {category_key}...")
    headers = {"User-Agent": USER_AGENT}
    timeout = aiohttp.ClientTimeout(total=REQUEST_TIMEOUT_SECONDS)
    try:
        async with session.get(url, timeout=timeout, headers=headers) as response:
            response.raise_for_status()
            text_content = await response.text(encoding="utf-8")
            reader = csv.DictReader(StringIO(text_content))
            return category_key, list(reader)
    except Exception as e:
        # Broad catch: one failed category shouldn't abort the whole run.
        print(f"Error for {category_key} ({url}): {type(e).__name__} - {e}")
        return category_key, []


async def process_all_mime_types():
    final_data = OrderedDict()
    # Pre-initialize obsolete_data to preserve category order.
    obsolete_data = OrderedDict((key, []) for key in URLS)

    print("Starting MIME type download...\n")
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_category_data(session, key, url) for key, url in URLS.items()]
        # gather() preserves task order, so results line up with URLS.
        results = await asyncio.gather(*tasks)

    print("\nProcessing downloaded data...")
    total_processed_entries = 0

    for category_key, raw_rows in results:
        if not raw_rows:
            final_data[category_key] = []
            continue

        active_entries = []
        for row in raw_rows:
            name = row.get('Name', '').strip()
            template = row.get('Template', '').strip()
            # Some registry rows have an empty Template column; skip those.
            if name and template:
                entry = {'name': name, 'template': template}
                if is_obsolete_or_deprecated(name):
                    obsolete_data[category_key].append(entry)
                else:
                    active_entries.append(entry)

        final_data[category_key] = active_entries
        total_processed_entries += len(active_entries)
        print(f"Processed {len(active_entries)} active types for {category_key}.")

    # Drop obsolete categories with no entries, then append the rest to final_data.
    final_obsolete_data = OrderedDict((k, v) for k, v in obsolete_data.items() if v)
    if final_obsolete_data:
        final_data["obsolete_deprecated"] = final_obsolete_data
        obsolete_count = sum(len(v) for v in final_obsolete_data.values())
        total_processed_entries += obsolete_count
        print(f"Categorized {obsolete_count} obsolete/deprecated types by original category.")

    print(f"\nTotal entries processed: {total_processed_entries}")

    try:
        with open(OUTPUT_JSON_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_data, f, indent=2, ensure_ascii=False)
        print(f"Data saved to {OUTPUT_JSON_FILE}")
    except IOError as e:
        print(f"Error writing JSON: {e}")


if __name__ == "__main__":
    asyncio.run(process_all_mime_types())
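
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not executed by this script): once the fetcher
# has run, the generated mime_types.json can be loaded and queried as below.
# The key names and the {"name", "template"} entry shape match what
# process_all_mime_types() writes above; the printed values depend on the
# current state of the IANA registry.
#
#     import json
#
#     with open("mime_types.json", encoding="utf-8") as f:
#         data = json.load(f)
#
#     print(len(data["application"]))       # count of active application/* types
#     print(data["application"][0])         # first entry, e.g. {"name": ..., "template": ...}
#     print(list(data.get("obsolete_deprecated", {})))  # categories holding retired types
# ---------------------------------------------------------------------------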