#!/usr/bin/env python3 """ Filter the 2800+ secondary indicators down to a curated set that is genuinely useful for transport analysis. Uses relevance keywords per category to keep only the most pertinent indicators. """ import json from collections import defaultdict from pathlib import Path RESULTS_DIR = Path(__file__).parent.parent / "results" # Per-category relevance filters: indicator name/definition must match at least one CATEGORY_FILTERS = { "Climate & Environment": { "keywords": [ "co2", "ghg", "greenhouse", "emission", "carbon", "transport", "fuel", "fossil", "air quality", "air pollution", "particulate", "pm2.5", "pm10", "climate", "flood", "cyclone", "disaster", "hazard", "drought", "sea level", "temperature", "rainfall", "weather", "storm", "vulnerability", "adaptation", "resilience", "nitrous", "methane", "energy sector", "combustion", ], "exclude": [ "tobacco", "vaccine", "malaria", "tuberculosis", "hiv", "birth", "fertility", "mortality rate, under", "maternal", "neonatal", "immunization", ], }, "Energy & Fuel": { "keywords": [ "fuel", "oil", "petroleum", "diesel", "gasoline", "kerosene", "electricity", "electric", "power", "energy consumption", "energy use", "energy intensity", "renewable", "solar", "wind", "battery", "charging", "grid", "fossil", "coal", "energy access", "electrification", ], "exclude": [ "cooking", "bank", "account", "loan", ], }, "Conflict & Fragility": { "keywords": [ "conflict", "war", "armed", "violence", "fragil", "peace", "political stability", "security", "terror", "refugee", "displaced", "displacement", "migration", "humanitarian", "state failure", "civil", "military", "battle", "governance", "corruption", "rule of law", "safety", "instability", ], "exclude": [ "gender", "domestic violence", "intimate partner", "school", "bullying", ], }, "Land Use & Urbanization": { "keywords": [ "urban", "rural", "city", "cities", "metropolitan", "population density", "land area", "arable", "forest", "deforestation", "land use", "built-up", "settlement", "slum", "agglomeration", "suburban", "sprawl", "urbanization", "urban population", "rural population", "surface area", "territorial", "housing", ], "exclude": [], }, "Water & Waterways": { "keywords": [ "water", "waterway", "river", "canal", "port", "flood", "dam", "hydropower", "irrigation", "freshwater", "coastal", "sea", "ocean", "marine", "fishery", "fish", "inland water", "navigable", "drainage", "watershed", "blue economy", "aquatic", ], "exclude": [ "drinking water", "sanitation", "handwash", "soap", "safely managed water", ], }, "Trade & Economic Corridors": { "keywords": [ "trade", "export", "import", "tariff", "customs", "border", "cross-border", "freight", "cargo", "supply chain", "value chain", "corridor", "fdi", "foreign direct", "investment", "competitiveness", "doing business", "ease of", "market access", "trade facilitation", "non-tariff", "logistics", "special economic zone", "free trade", ], "exclude": [ "bank", "loan", "deposit", "insurance", "stock market", "equity", "bond", ], }, "Infrastructure (General)": { "keywords": [ "infrastructure", "public private partnership", "ppp", "concession", "investment in infrastructure", "capital expenditure", "construction", ], "exclude": [], }, "Population & Demographics": { "keywords": [ "population", "density", "labor force", "employment", "unemployment", "workforce", "working age", "rural population", "urban population", "migration", "remittance", "diaspora", "demographic", "age dependency", "growth rate", ], "exclude": [ "vaccine", "contraceptive", "antenatal", ], }, "Governance & Institutions": { "keywords": [ "government expenditure", "government spending", "public spending", "budget", "fiscal", "governance", "regulatory quality", "rule of law", "control of corruption", "government effectiveness", "public administration", "institutional", "transparency", "accountability", "tax revenue", "public debt", "infrastructure spending", ], "exclude": [], }, } def matches(indicator, keywords, exclude): text = ( f"{indicator.get('name', '')} {indicator.get('definition_short', '')}" ).lower() if any(ex in text for ex in exclude): return False return any(kw in text for kw in keywords) def main(): with open(RESULTS_DIR / "secondary-indicators-all.json") as f: all_secondary = json.load(f) print(f"Input: {len(all_secondary)} secondary indicators\n") curated = [] by_category = defaultdict(list) for ind in all_secondary: cat = ind.get("secondary_category", "") filt = CATEGORY_FILTERS.get(cat) if not filt: continue if matches(ind, filt["keywords"], filt["exclude"]): curated.append(ind) by_category[cat].append(ind) print(f"Curated total: {len(curated)}\n") # Print summary for cat in sorted(by_category.keys()): inds = by_category[cat] print(f"\n{'='*70}") print(f"{cat}: {len(inds)} indicators") print(f"{'='*70}") # Group by subcategory by_sub = defaultdict(list) for i in inds: by_sub[i.get("secondary_subcategory", "other")].append(i) for sub in sorted(by_sub.keys()): sub_inds = by_sub[sub] print(f"\n {sub} ({len(sub_inds)}):") for i in sorted(sub_inds, key=lambda x: x.get("idno") or "")[:8]: name = (i.get("name") or "")[:75] print(f" {i.get('idno', '?')}: {name}") if len(sub_inds) > 8: print(f" ... and {len(sub_inds) - 8} more") # Save curated set with open(RESULTS_DIR / "secondary-curated.json", "w") as f: json.dump(curated, f, indent=2) # Save curated summary by category summary = {"total": len(curated), "categories": {}} for cat, inds in sorted(by_category.items()): by_sub = defaultdict(list) for i in inds: by_sub[i.get("secondary_subcategory", "other")].append(i) cat_summary = {"total": len(inds), "subcategories": {}} for sub, sub_inds in sorted(by_sub.items(), key=lambda x: -len(x[1])): cat_summary["subcategories"][sub] = { "count": len(sub_inds), "indicators": [ { "idno": i["idno"], "name": i["name"], "database_id": i.get("database_id"), "definition_short": i.get("definition_short"), "unit": i.get("measurement_unit"), } for i in sorted(sub_inds, key=lambda x: x.get("idno") or "") ], } summary["categories"][cat] = cat_summary with open(RESULTS_DIR / "secondary-curated-summary.json", "w") as f: json.dump(summary, f, indent=2) print(f"\n\nSaved: secondary-curated.json ({len(curated)} indicators)") print(f"Saved: secondary-curated-summary.json") if __name__ == "__main__": main()