Initial commit
commit 71667d81e4
3 changed files with 414 additions and 0 deletions

fetcher.py (new normal file, +102)
@@ -0,0 +1,102 @@
import asyncio
import aiohttp
import csv
import json
import re
from io import StringIO
from collections import OrderedDict

# IANA media type registries, one CSV per top-level type.
URLS = OrderedDict([
    ('text', 'https://www.iana.org/assignments/media-types/text.csv'),
    ('image', 'https://www.iana.org/assignments/media-types/image.csv'),
    ('audio', 'https://www.iana.org/assignments/media-types/audio.csv'),
    ('video', 'https://www.iana.org/assignments/media-types/video.csv'),
    ('application', 'https://www.iana.org/assignments/media-types/application.csv'),
    ('font', 'https://www.iana.org/assignments/media-types/font.csv'),
    ('model', 'https://www.iana.org/assignments/media-types/model.csv'),
    ('multipart', 'https://www.iana.org/assignments/media-types/multipart.csv'),
    ('message', 'https://www.iana.org/assignments/media-types/message.csv'),
    ('haptics', 'https://www.iana.org/assignments/media-types/haptics.csv')
])

OUTPUT_JSON_FILE = "mime_types.json"
REQUEST_TIMEOUT_SECONDS = 20
USER_AGENT = "MimeTypeFetcher/1.0.0"

# Matches "(obsolete...)" / "(deprecated ...)" annotations or the bare words,
# case-insensitively, anywhere in a registry entry name.
OBSOLETE_DEPRECATED_PATTERN = re.compile(
    r"\(obsolete(d)?([^\)]*)\)|"
    r"\bobsolete(d)?\b|"
    r"\(deprecated([^\)]*)\)|"
    r"\bdeprecated\b",
    re.IGNORECASE
)


def is_obsolete_or_deprecated(name_str):
    return bool(OBSOLETE_DEPRECATED_PATTERN.search(name_str))
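# Illustrative behaviour of the classifier above (hypothetical inputs, not
# taken from the registry):
#   is_obsolete_or_deprecated("foo (OBSOLETED by bar)")  -> True
#   is_obsolete_or_deprecated("foo - DEPRECATED")        -> True
#   is_obsolete_or_deprecated("plain")                   -> False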

async def fetch_category_data(session, category_key, url):
    """Download one registry CSV and return (category_key, parsed rows); on failure return an empty list."""
    print(f"Fetching: {category_key}...")
    headers = {"User-Agent": USER_AGENT}
    try:
        # Explicit ClientTimeout for the per-request total timeout.
        async with session.get(
            url,
            timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT_SECONDS),
            headers=headers,
        ) as response:
            response.raise_for_status()
            text_content = await response.text(encoding="utf-8")
            reader = csv.DictReader(StringIO(text_content))
            return category_key, list(reader)
    except Exception as e:
        print(f"Error for {category_key} ({url}): {type(e).__name__} - {e}")
        return category_key, []
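# Each parsed row is a dict keyed by the CSV header. The processing below only
# relies on the 'Name' and 'Template' columns; a row might look roughly like
# {'Name': 'plain', 'Template': 'text/plain', ...} (sketch; the registry CSVs
# may carry additional columns such as a reference column).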

async def process_all_mime_types():
    final_data = OrderedDict()
    # Pre-initialize obsolete_data to preserve category order
    obsolete_data = OrderedDict((key, []) for key in URLS.keys())

    print("Starting MIME type download...\n")
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_category_data(session, key, url) for key, url in URLS.items()]
        results = await asyncio.gather(*tasks)

    print("\nProcessing downloaded data...")
    total_processed_entries = 0

    for category_key, raw_rows in results:
        active_entries = []
        if not raw_rows:
            final_data[category_key] = []
            continue

        for row in raw_rows:
            name = row.get('Name', '').strip()
            template = row.get('Template', '').strip()

            if name and template:
                entry = {'name': name, 'template': template}
                if is_obsolete_or_deprecated(name):
                    obsolete_data[category_key].append(entry)
                else:
                    active_entries.append(entry)

        final_data[category_key] = active_entries
        total_processed_entries += len(active_entries)
        print(f"Processed {len(active_entries)} active types for {category_key}.")

    # Filter out obsolete categories with no entries and add the rest to final_data
    final_obsolete_data = OrderedDict([(k, v) for k, v in obsolete_data.items() if v])
    if final_obsolete_data:
        final_data["obsolete_deprecated"] = final_obsolete_data
        obsolete_count = sum(len(v) for v in final_obsolete_data.values())
        total_processed_entries += obsolete_count
        print(f"Categorized {obsolete_count} obsolete/deprecated types by original category.")

    print(f"\nTotal entries processed: {total_processed_entries}")

    try:
        with open(OUTPUT_JSON_FILE, 'w', encoding='utf-8') as f:
            json.dump(final_data, f, indent=2, ensure_ascii=False)
        print(f"Data saved to {OUTPUT_JSON_FILE}")
    except IOError as e:
        print(f"Error writing JSON: {e}")


if __name__ == "__main__":
    asyncio.run(process_all_mime_types())
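# Resulting mime_types.json (shape only; entry values are a sketch):
# {
#   "text":  [{"name": "plain", "template": "text/plain"}, ...],
#   "image": [...],
#   ...
#   "obsolete_deprecated": {"text": [...], "application": [...], ...}
# }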