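"""Download the IANA media-type registry CSV files and save them as JSON.

Entries whose name is marked obsolete or deprecated are stored separately
under an ``obsolete_deprecated`` key, grouped by their original category.
"""
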
import asyncio
import csv
import json
import re
import sys
from collections import OrderedDict, defaultdict
from io import StringIO

import aiohttp


# Every registry CSV lives under this prefix as "<category>.csv".
_IANA_MEDIA_TYPES_BASE_URL = "https://www.iana.org/assignments/media-types/"

# Top-level media-type categories, in the order they appear in the output.
_MEDIA_TYPE_CATEGORIES = (
    'text',
    'image',
    'audio',
    'video',
    'application',
    'font',
    'model',
    'multipart',
    'message',
    'haptics',
)

# Maps each category name to the URL of its registry CSV file.
URLS = OrderedDict(
    (category, f"{_IANA_MEDIA_TYPES_BASE_URL}{category}.csv")
    for category in _MEDIA_TYPE_CATEGORIES
)


# Path of the JSON file the collected data is written to.
OUTPUT_JSON_FILE = "mime_types.json"
# Timeout, in seconds, passed to each HTTP GET request.
REQUEST_TIMEOUT_SECONDS = 20
# Value sent in the User-Agent header of every request.
USER_AGENT = "MimeTypeFetcher/1.0.0"


# Case-insensitive match for an obsolete/deprecated marker, either as a
# standalone word ("foo - DEPRECATED") or as a parenthesised note that starts
# with the marker ("foo (OBSOLETED in favor of bar)").
OBSOLETE_DEPRECATED_PATTERN = re.compile(
    r"\(obsolete(d)?([^\)]*)\)|"
    r"\bobsolete(d)?\b|"
    r"\(deprecated([^\)]*)\)|"
    r"\bdeprecated\b",
    re.IGNORECASE,
)


def is_obsolete_or_deprecated(name_str):
    """Return True when *name_str* carries an obsolete/deprecated marker.

    Args:
        name_str: Text to inspect. ``None`` and the empty string are treated
            as "no marker".

    Returns:
        ``True`` if ``OBSOLETE_DEPRECATED_PATTERN`` matches anywhere in
        *name_str*, otherwise ``False``.
    """
    if not name_str:
        # Nothing to search; also avoids a TypeError from re on None.
        return False
    return OBSOLETE_DEPRECATED_PATTERN.search(name_str) is not None


async def fetch_category_data(session, category_key, url):
    """Download one category's CSV file and parse it into row dictionaries.

    Args:
        session: Object whose ``get(url, timeout=..., headers=...)`` returns
            an async context manager yielding the response (the caller in
            this file passes an ``aiohttp.ClientSession``).
        category_key: Category label; returned unchanged and used in the
            progress and error messages.
        url: Address of the CSV file to download.

    Returns:
        A ``(category_key, rows)`` tuple, where ``rows`` is the list of
        dictionaries produced by ``csv.DictReader``. ``rows`` is an empty
        list when the request or the parsing fails; the error is reported on
        stderr rather than raised.
    """
    print(f"Fetching: {category_key}...")
    headers = {"User-Agent": USER_AGENT}
    # aiohttp documents ClientTimeout as the timeout type; a bare number is
    # accepted only for backward compatibility.
    timeout = aiohttp.ClientTimeout(total=REQUEST_TIMEOUT_SECONDS)
    try:
        async with session.get(url, timeout=timeout, headers=headers) as response:
            response.raise_for_status()
            text_content = await response.text(encoding="utf-8")
        # Parse after leaving the context manager so the connection is
        # released before the CPU-bound CSV work.
        rows = list(csv.DictReader(StringIO(text_content)))
    except Exception as e:
        # Deliberately broad: this coroutine is awaited through
        # asyncio.gather, and one failing category must not abort the others.
        print(
            f"Error for {category_key} ({url}): {type(e).__name__} - {e}",
            file=sys.stderr,
        )
        return category_key, []
    return category_key, rows


def _clean_field(row, column):
    """Return the stripped text stored under *column* in *row*, or ''."""
    # csv.DictReader fills columns missing from a short row with None, so
    # ``row.get(column, '')`` alone can still yield None.
    return (row.get(column) or '').strip()


def _partition_rows(raw_rows):
    """Split CSV rows into (active, obsolete) lists of name/template entries.

    Rows lacking a non-empty 'Name' or 'Template' value are skipped.
    """
    active_entries = []
    obsolete_entries = []
    for row in raw_rows:
        name = _clean_field(row, 'Name')
        template = _clean_field(row, 'Template')
        if not (name and template):
            continue
        entry = {'name': name, 'template': template}
        if is_obsolete_or_deprecated(name):
            obsolete_entries.append(entry)
        else:
            active_entries.append(entry)
    return active_entries, obsolete_entries


async def process_all_mime_types():
    """Fetch every category in ``URLS``, classify the rows and write the JSON.

    All categories are downloaded concurrently. Each row with both a name and
    a template becomes a ``{'name', 'template'}`` entry, stored under its
    category key, or under ``obsolete_deprecated[category]`` when the name is
    flagged by ``is_obsolete_or_deprecated``. A category whose download
    produced no rows is stored as an empty list. The result is written to
    ``OUTPUT_JSON_FILE``; a write failure is reported on stderr, not raised.
    """
    final_data = OrderedDict()
    # Pre-initialize obsolete_data to preserve category order
    obsolete_data = OrderedDict((key, []) for key in URLS)

    print("Starting MIME type download...\n")
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_category_data(session, key, url) for key, url in URLS.items()]
        # gather returns results in the order the tasks were supplied.
        results = await asyncio.gather(*tasks)

    print("\nProcessing downloaded data...")
    total_processed_entries = 0

    for category_key, raw_rows in results:
        if not raw_rows:
            final_data[category_key] = []
            continue

        active_entries, obsolete_entries = _partition_rows(raw_rows)
        obsolete_data[category_key].extend(obsolete_entries)
        final_data[category_key] = active_entries
        total_processed_entries += len(active_entries)
        print(f"Processed {len(active_entries)} active types for {category_key}.")

    # Filter out obsolete categories with no entries and add to final_data
    final_obsolete_data = OrderedDict(
        (key, entries) for key, entries in obsolete_data.items() if entries
    )
    if final_obsolete_data:
        final_data["obsolete_deprecated"] = final_obsolete_data
        obsolete_count = sum(len(entries) for entries in final_obsolete_data.values())
        total_processed_entries += obsolete_count
        print(f"Categorized {obsolete_count} obsolete/deprecated types by original category.")

    print(f"\nTotal entries processed: {total_processed_entries}")

    try:
        with open(OUTPUT_JSON_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_data, f, indent=2, ensure_ascii=False)
        print(f"Data saved to {OUTPUT_JSON_FILE}")
    except OSError as e:
        # IOError is an alias of OSError, so this catches the same errors.
        print(f"Error writing JSON: {e}", file=sys.stderr)


# Run the download-and-export pipeline only when executed as a script.
if __name__ == "__main__":
    asyncio.run(process_all_mime_types())