mime-type-reference/fetcher.py

import asyncio
import csv
import json
import re
from collections import OrderedDict
from io import StringIO

import aiohttp
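
# IANA media-type registry CSV endpoints, keyed by top-level type (insertion order is preserved).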
URLS = OrderedDict([
    ('text', 'https://www.iana.org/assignments/media-types/text.csv'),
    ('image', 'https://www.iana.org/assignments/media-types/image.csv'),
    ('audio', 'https://www.iana.org/assignments/media-types/audio.csv'),
    ('video', 'https://www.iana.org/assignments/media-types/video.csv'),
    ('application', 'https://www.iana.org/assignments/media-types/application.csv'),
    ('font', 'https://www.iana.org/assignments/media-types/font.csv'),
    ('model', 'https://www.iana.org/assignments/media-types/model.csv'),
    ('multipart', 'https://www.iana.org/assignments/media-types/multipart.csv'),
    ('message', 'https://www.iana.org/assignments/media-types/message.csv'),
    ('haptics', 'https://www.iana.org/assignments/media-types/haptics.csv'),
])

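# Output path and HTTP client settings.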
OUTPUT_JSON_FILE = "mime_types.json"
REQUEST_TIMEOUT_SECONDS = 20
USER_AGENT = "MimeTypeFetcher/1.0.0"
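
# Matches names annotated as obsolete/deprecated, either in parentheses or as a bare word (case-insensitive).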
OBSOLETE_DEPRECATED_PATTERN = re.compile(
    r"\(obsolete(d)?([^\)]*)\)|"
    r"\bobsolete(d)?\b|"
    r"\(deprecated([^\)]*)\)|"
    r"\bdeprecated\b",
    re.IGNORECASE
)


def is_obsolete_or_deprecated(name_str):
    return bool(OBSOLETE_DEPRECATED_PATTERN.search(name_str))


async def fetch_category_data(session, category_key, url):
    """Download one category's CSV and parse it into row dicts.

    Returns (category_key, rows); on any failure the row list is empty so the
    remaining categories can still be processed.
    """
    print(f"Fetching: {category_key}...")
    headers = {"User-Agent": USER_AGENT}
    timeout = aiohttp.ClientTimeout(total=REQUEST_TIMEOUT_SECONDS)
    try:
        async with session.get(url, timeout=timeout, headers=headers) as response:
            response.raise_for_status()
            text_content = await response.text(encoding="utf-8")
            reader = csv.DictReader(StringIO(text_content))
            return category_key, list(reader)
    except Exception as e:
        print(f"Error for {category_key} ({url}): {type(e).__name__} - {e}")
        return category_key, []


async def process_all_mime_types():
    """Fetch every category concurrently, split entries into active vs. obsolete, and write JSON."""
    final_data = OrderedDict()
    # Pre-initialize obsolete_data to preserve category order
    obsolete_data = OrderedDict((key, []) for key in URLS.keys())

    print("Starting MIME type download...\n")
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_category_data(session, key, url) for key, url in URLS.items()]
        results = await asyncio.gather(*tasks)

    print("\nProcessing downloaded data...")
    total_processed_entries = 0
    for category_key, raw_rows in results:
        active_entries = []
        if not raw_rows:
            final_data[category_key] = []
            continue
        for row in raw_rows:
            name = row.get('Name', '').strip()
            template = row.get('Template', '').strip()
            if name and template:
                entry = {'name': name, 'template': template}
                if is_obsolete_or_deprecated(name):
                    obsolete_data[category_key].append(entry)
                else:
                    active_entries.append(entry)
        final_data[category_key] = active_entries
        total_processed_entries += len(active_entries)
        print(f"Processed {len(active_entries)} active types for {category_key}.")

    # Filter out obsolete categories with no entries and add to final_data
    final_obsolete_data = OrderedDict([(k, v) for k, v in obsolete_data.items() if v])
    if final_obsolete_data:
        final_data["obsolete_deprecated"] = final_obsolete_data
        obsolete_count = sum(len(v) for v in final_obsolete_data.values())
        total_processed_entries += obsolete_count
        print(f"Categorized {obsolete_count} obsolete/deprecated types by original category.")

    print(f"\nTotal entries processed: {total_processed_entries}")
    try:
        with open(OUTPUT_JSON_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_data, f, indent=2, ensure_ascii=False)
        print(f"Data saved to {OUTPUT_JSON_FILE}")
    except IOError as e:
        print(f"Error writing JSON: {e}")


if __name__ == "__main__":
    asyncio.run(process_all_mime_types())