#!/usr/bin/env python3
"""
Filter the raw exploration results down to indicators that are genuinely
transport-related, based on the Data360 topic taxonomy.

Transport is under: P4 (Infrastructure) > P4_000003 (Transport)

Sub-topics (L2):
    P4_000023: Air Transport
    P4_000025: Green and Inclusive Mobility
    P4_000026: Highway Asset Management and Rural Access
    P4_000027: Maritime Transport and Logistics
    P4_000029: Railways
    P4_000030: Regional Connectivity and Transport Corridors
    P4_000031: Road Safety
    P4_000032: Transport Economics
    P4_000033: Urban Mobility
"""

import json
import re
from collections import defaultdict
from functools import lru_cache
from pathlib import Path

RESULTS_DIR = Path(__file__).parent.parent / "results"

# The transport topic IDs in the Data360 taxonomy
TRANSPORT_TOPIC_IDS = {
    "P4_000003",  # Transport (L1)
    "P4_000023",  # Air Transport
    "P4_000025",  # Green and Inclusive Mobility
    "P4_000026",  # Highway Asset Management and Rural Access
    "P4_000027",  # Maritime Transport and Logistics
    "P4_000029",  # Railways
    "P4_000030",  # Regional Connectivity and Transport Corridors
    "P4_000031",  # Road Safety
    "P4_000032",  # Transport Economics
    "P4_000033",  # Urban Mobility
}

# Also include indicators that are clearly transport even if tagged differently.
# Keywords are matched as whole words (an optional plural "s"/"es" is allowed),
# never as raw substrings, so "port" does not fire on "export" or "report" and
# "bus" does not fire on "business". Surrounding whitespace in an entry is
# ignored, which is why the legacy "rail " entry still works.
TRANSPORT_KEYWORDS = [
    "transport", "transportation", "road", "railway", "railroad", "aviation",
    "airport", "shipping", "freight", "logistics", "port", "maritime",
    "vehicle", "traffic", "highway", "bus", "rail ", "airline", "cargo",
    "mobility", "passenger", "fleet",
]

# Label used for matched indicators that carry no L2 transport sub-topic.
_UNCLASSIFIED = "General / Unclassified"

# Number of indicators listed per sub-topic in the console report.
_PREVIEW_LIMIT = 5


def load_all_indicators():
    """Load the raw indicator list from ``all-transport-indicators.json``.

    Returns:
        The decoded JSON content of the file in ``RESULTS_DIR``.

    Raises:
        FileNotFoundError: If the input file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit encoding: JSON is UTF-8, the platform default may not be.
    with open(RESULTS_DIR / "all-transport-indicators.json", encoding="utf-8") as f:
        return json.load(f)


def is_transport_by_topic(indicator: dict) -> bool:
    """Check if indicator has a transport topic tag.

    A missing or null ``topics`` field is treated as "no topics".
    """
    topics = indicator.get("topics") or []
    return any(t.get("id") in TRANSPORT_TOPIC_IDS for t in topics)


@lru_cache(maxsize=8)
def _keyword_pattern(keywords: tuple):
    """Compile a whole-word regex for *keywords* (cached per keyword tuple)."""
    stems = {kw.strip().lower() for kw in keywords if kw and kw.strip()}
    if not stems:
        # An empty alternation would match at every word boundary; use a
        # pattern that can never match instead.
        return re.compile(r"(?!)")
    # Longest first so the alternation prefers "railway" over "rail".
    alternation = "|".join(
        re.escape(stem) for stem in sorted(stems, key=lambda s: (-len(s), s))
    )
    return re.compile(rf"\b(?:{alternation})(?:es|s)?\b")


def is_transport_by_keyword(indicator: dict) -> bool:
    """Check if indicator name/definition matches transport keywords.

    The indicator's ``name`` and ``definition_short`` are lower-cased and
    searched for any entry of ``TRANSPORT_KEYWORDS`` as a whole word (plural
    forms included). Missing or null fields are treated as empty text.
    """
    name = indicator.get("name") or ""
    definition = indicator.get("definition_short") or ""
    text = f"{name} {definition}".lower()
    # The keyword list is read at call time so edits to it take effect.
    pattern = _keyword_pattern(tuple(TRANSPORT_KEYWORDS))
    return pattern.search(text) is not None


def get_transport_subtopics(indicator: dict) -> list:
    """Return the L2 transport subtopics for an indicator.

    Each entry is the topic's name, or its ID when the name is missing or
    null. The L1 "Transport" topic (``P4_000003``) itself is excluded.
    """
    subtopics = []
    for t in indicator.get("topics") or []:
        tid = t.get("id", "")
        if tid in TRANSPORT_TOPIC_IDS and tid != "P4_000003":
            # Fall back to the ID so grouping keys are never None.
            subtopics.append(t.get("name") or tid)
    return subtopics


def _select_transport_indicators(indicators):
    """Return indicators matched by topic or keyword, annotated in place."""
    selected = []
    for ind in indicators:
        by_topic = is_transport_by_topic(ind)
        if not (by_topic or is_transport_by_keyword(ind)):
            continue
        ind["_matched_by"] = "topic" if by_topic else "keyword_only"
        ind["_transport_subtopics"] = get_transport_subtopics(ind)
        selected.append(ind)
    return selected


def _group_by_subtopic(transport_indicators):
    """Group indicators under each of their sub-topics (or the fallback label)."""
    groups = defaultdict(list)
    for ind in transport_indicators:
        for subtopic in ind["_transport_subtopics"] or [_UNCLASSIFIED]:
            groups[subtopic].append(ind)
    return groups


def _group_by_database(transport_indicators):
    """Group indicators by ``database_id``; missing/null IDs go to "unknown"."""
    groups = defaultdict(list)
    for ind in transport_indicators:
        groups[ind.get("database_id") or "unknown"].append(ind)
    return groups


def _largest_first(groups):
    """Return ``(key, members)`` pairs ordered by descending group size."""
    return sorted(groups.items(), key=lambda item: -len(item[1]))


def _sorted_by_idno(inds):
    """Return *inds* ordered by ``idno``; a missing/null ``idno`` sorts first."""
    return sorted(inds, key=lambda ind: ind.get("idno") or "")


def _build_summary(transport_indicators, by_subtopic, by_database):
    """Build the JSON-serialisable summary written to transport-summary.json."""
    summary = {
        "total_transport_indicators": len(transport_indicators),
        "subtopic_counts": {k: len(v) for k, v in _largest_first(by_subtopic)},
        "database_counts": {k: len(v) for k, v in _largest_first(by_database)},
        "subtopics": {},
        "databases": {},
    }
    for subtopic, inds in sorted(by_subtopic.items()):
        summary["subtopics"][subtopic] = [
            {
                "idno": ind.get("idno"),
                "name": ind.get("name"),
                "database_id": ind.get("database_id"),
                "unit": ind.get("measurement_unit"),
                "periodicity": ind.get("periodicity"),
            }
            for ind in _sorted_by_idno(inds)
        ]
    for db, inds in sorted(by_database.items()):
        summary["databases"][db] = {
            "database_name": inds[0].get("database_name") or db,
            "indicator_count": len(inds),
            "indicators": [
                {
                    "idno": ind.get("idno"),
                    "name": ind.get("name"),
                    "subtopics": ind["_transport_subtopics"],
                }
                for ind in _sorted_by_idno(inds)
            ],
        }
    return summary


def main():
    """Filter the raw indicators, print a report, and write both result files.

    Reads ``all-transport-indicators.json`` from ``RESULTS_DIR`` and writes
    ``transport-indicators-filtered.json`` and ``transport-summary.json``
    back into the same directory.
    """
    indicators = load_all_indicators()
    print(f"Total indicators from search: {len(indicators)}")

    # Filter to transport-relevant
    transport_indicators = _select_transport_indicators(indicators)
    print(f"Transport-relevant indicators: {len(transport_indicators)}")

    # Group by subtopic
    by_subtopic = _group_by_subtopic(transport_indicators)
    print("\n=== TRANSPORT SUB-TOPICS ===")
    for subtopic, inds in _largest_first(by_subtopic):
        print(f"\n {subtopic}: {len(inds)} indicators")
        for ind in inds[:_PREVIEW_LIMIT]:
            print(f" - {ind.get('idno')}: {ind.get('name')}")
        if len(inds) > _PREVIEW_LIMIT:
            print(f" ... and {len(inds) - _PREVIEW_LIMIT} more")

    # Group by database
    by_database = _group_by_database(transport_indicators)
    print("\n\n=== DATABASES WITH TRANSPORT INDICATORS ===")
    for db, inds in _largest_first(by_database):
        db_name = inds[0].get("database_name") or db
        print(f" {db} ({db_name}): {len(inds)} indicators")

    # Save filtered results
    filtered_path = RESULTS_DIR / "transport-indicators-filtered.json"
    with open(filtered_path, "w", encoding="utf-8") as f:
        json.dump(transport_indicators, f, indent=2)

    # Save a clean summary
    summary = _build_summary(transport_indicators, by_subtopic, by_database)
    with open(RESULTS_DIR / "transport-summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(
        f"\nSaved: transport-indicators-filtered.json "
        f"({len(transport_indicators)} indicators)"
    )
    print("Saved: transport-summary.json")


if __name__ == "__main__":
    main()