#!/usr/bin/env python3
"""
Explore secondary/adjacent datasets relevant to transport analysis.

These sit outside the core Transport topic (P4_000003) but are useful for
understanding transport in context: climate, conflict, land use,
water/waterways, energy, urbanization, trade corridors, etc.
"""

import json
from collections import defaultdict
from pathlib import Path

import requests

BASE = "https://data360api.worldbank.org/data360"
RESULTS_DIR = Path(__file__).parent.parent / "results"
RESULTS_DIR.mkdir(exist_ok=True)

# Seconds to wait for the API before giving up on a single request; without
# a timeout one stalled connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 60

# Already captured transport topic IDs - exclude these
# NOTE(review): nothing in this file reads this set; duplicates are filtered
# by indicator idno in main() instead. Confirm whether a topic-level filter
# was intended.
CORE_TRANSPORT_TOPICS = {
    "P4_000003", "P4_000023", "P4_000025", "P4_000026", "P4_000027",
    "P4_000029", "P4_000030", "P4_000031", "P4_000032", "P4_000033",
}

# Result files from earlier runs whose indicator IDs count as "core
# transport" and are therefore left out of the secondary set.
CORE_INDICATOR_FILES = (
    "transport-topic-filtered-all.json",
    "transport-indicators-filtered.json",
)

# Fields requested for every search hit.
SELECT_FIELDS = (
    "series_description/idno",
    "series_description/name",
    "series_description/database_id",
    "series_description/database_name",
    "series_description/definition_short",
    "series_description/topics",
    "series_description/measurement_unit",
    "series_description/periodicity",
)

# Secondary categories with search strategies.
# Each: (category, subcategory, method, query_or_topic_id)
SECONDARY_SEARCHES = [
    # ── CLIMATE & ENVIRONMENT ──
    ("Climate & Environment", "GHG Emissions", "search", "greenhouse gas emissions"),
    ("Climate & Environment", "GHG Emissions", "search", "CO2 emissions"),
    ("Climate & Environment", "GHG Emissions", "search", "carbon emissions fuel"),
    ("Climate & Environment", "Climate Adaptation", "topic", "P1_000044"),  # Climate Adaptation and Resilience
    ("Climate & Environment", "Climate Change", "topic", "P1_000002"),  # Climate Change L1
    ("Climate & Environment", "Pollution", "topic", "P1_000025"),  # Pollution Management
    ("Climate & Environment", "Fossil Fuel Subsidies", "search", "fossil fuel subsidy"),
    ("Climate & Environment", "Air Pollution", "search", "air pollution"),
    ("Climate & Environment", "Natural Disasters", "search", "flood disaster cyclone"),
    ("Climate & Environment", "Natural Hazards", "topic", "P4_000036"),  # DRM and Resilience

    # ── ENERGY & FUEL ──
    ("Energy & Fuel", "Energy Access", "topic", "P4_000004"),  # Energy Access
    ("Energy & Fuel", "Energy Economics", "topic", "P4_000005"),  # Energy Economics
    ("Energy & Fuel", "Renewable Energy", "topic", "P4_000013"),  # Renewable Energy
    ("Energy & Fuel", "Fuel & Oil", "search", "fuel oil petroleum diesel"),
    ("Energy & Fuel", "Electricity", "search", "electricity consumption access"),

    # ── CONFLICT & FRAGILITY ──
    ("Conflict & Fragility", "Conflict", "search", "conflict war armed violence"),
    ("Conflict & Fragility", "Fragility", "search", "fragile state fragility"),
    ("Conflict & Fragility", "Political Stability", "search", "political stability governance"),
    (
        "Conflict & Fragility", "Displacement", "search",
        "refugee displacement internally displaced",
    ),
    ("Conflict & Fragility", "Social Cohesion", "topic", "P1_000029"),  # Social Cohesion

    # ── LAND USE & URBANIZATION ──
    ("Land Use & Urbanization", "Urban Development", "topic", "P4_000034"),  # Urban, Resilience and Land
    ("Land Use & Urbanization", "Urban Infrastructure", "topic", "P4_000038"),  # Sustainable Urban Infra
    ("Land Use & Urbanization", "Land & Geospatial", "topic", "P4_000037"),  # Land and Geospatial
    ("Land Use & Urbanization", "Urbanization", "search", "urban population urbanization city"),
    ("Land Use & Urbanization", "Land Use", "search", "land use agricultural arable forest"),
    ("Land Use & Urbanization", "Housing", "topic", "P4_000040"),  # Housing

    # ── WATER & WATERWAYS ──
    ("Water & Waterways", "Water Resources", "topic", "P1_000035"),  # Water Resources Management
    ("Water & Waterways", "Dams & Hydropower", "topic", "P1_000034"),  # Dams and Hydropower
    ("Water & Waterways", "Blue Economy", "topic", "P1_000022"),  # Blue Economy
    ("Water & Waterways", "Inland Waterways", "search", "inland waterway river navigation canal"),
    ("Water & Waterways", "Water & Economy", "topic", "P1_000051"),  # Water and the Economy

    # ── TRADE & ECONOMIC CORRIDORS ──
    ("Trade & Economic Corridors", "Trade Outcomes", "topic", "P3_000028"),  # Trade Outcomes
    ("Trade & Economic Corridors", "Trade Policy", "topic", "P3_000024"),  # Trade Policies
    ("Trade & Economic Corridors", "Investment Climate", "topic", "P3_000022"),  # Investment and Business
    ("Trade & Economic Corridors", "Exports & Imports", "search", "export import trade goods"),
    ("Trade & Economic Corridors", "FDI & Investment", "search", "foreign direct investment FDI"),
    ("Trade & Economic Corridors", "Supply Chain", "search", "supply chain value chain"),

    # ── INFRASTRUCTURE (non-transport) ──
    ("Infrastructure (General)", "PPPs", "topic", "P4_000022"),  # Public Private Partnerships
    ("Infrastructure (General)", "Infra Finance", "topic", "P4_000021"),  # Infrastructure Funding
    ("Infrastructure (General)", "Infra Analytics", "topic", "P4_000020"),  # Infrastructure Analytics
    ("Infrastructure (General)", "Power Sector", "topic", "P4_000012"),  # Power Sector Policies

    # ── POPULATION & DEMOGRAPHICS ──
    ("Population & Demographics", "Population", "search", "population density rural urban growth"),
    ("Population & Demographics", "Migration", "search", "migration remittance diaspora"),
    ("Population & Demographics", "Employment", "search", "employment labor workforce jobs"),

    # ── GOVERNANCE & INSTITUTIONS ──
    ("Governance & Institutions", "Public Finance", "topic", "P3_000015"),  # Public Finance Management
    ("Governance & Institutions", "Governance", "topic", "P3_000014"),  # Economic/Sociopolitical Governance
    (
        "Governance & Institutions", "Government Spending", "search",
        "government expenditure spending budget infrastructure",
    ),
]


def search(query, top=100, skip=0, filter_expr=None, timeout=REQUEST_TIMEOUT):
    """Run one page of a search against the ``searchv2`` endpoint.

    Args:
        query: Search text sent as the ``search`` field.
        top: Page size sent as ``top``.
        skip: Offset sent as ``skip``.
        filter_expr: Optional filter expression; sent as ``filter`` only
            when truthy.
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the response status signals an error.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    body = {
        "search": query,
        "select": ",".join(SELECT_FIELDS),
        "top": top,
        "skip": skip,
        "count": True,
    }
    if filter_expr:
        body["filter"] = filter_expr
    resp = requests.post(f"{BASE}/searchv2", json=body, timeout=timeout)
    resp.raise_for_status()
    return resp.json()


def search_all(query, filter_expr=None, batch=100, max_results=500):
    """Page through ``search`` and gather at most ``max_results`` hits.

    Args:
        query: Search text forwarded to ``search``.
        filter_expr: Optional filter expression forwarded to ``search``.
        batch: Page size used for every request.
        max_results: Upper bound on the number of hits returned.

    Returns:
        A ``(results, total)`` tuple. ``results`` is the list of hits from
        the ``value`` field of each page, truncated to ``max_results``.
        ``total`` is the ``@odata.count`` value of the first page, or 0 when
        that field is absent.
    """
    all_results = []
    skip = 0
    total = None
    while True:
        data = search(query, top=batch, skip=skip, filter_expr=filter_expr)
        if total is None:
            total = data.get("@odata.count", 0)
        values = data.get("value", [])
        if not values:
            break
        all_results.extend(values)
        skip += batch
        if skip >= min(total, max_results):
            break
    # Pages are fetched whole, so the last one can overshoot the cap (e.g.
    # batch=100 with max_results=250 collects 300); trim to honour the limit.
    del all_results[max(max_results, 0):]
    return all_results, total


def search_by_topic(topic_id):
    """Return up to 250 indicators tagged with ``topic_id``.

    Uses a wildcard query with a filter on the topics collection; the return
    value is the ``(results, total)`` tuple produced by ``search_all``.
    """
    return search_all(
        "*",
        filter_expr=f"series_description/topics/any(t: t/id eq '{topic_id}')",
        max_results=250,
    )


def parse_result(r, category, subcategory):
    """Flatten one search hit into an indicator record.

    Args:
        r: A single hit; its ``series_description`` mapping supplies the
            fields. A missing or null ``series_description`` yields a record
            whose API-derived fields are ``None`` (``topics`` is ``[]``).
        category: Stored as ``secondary_category``.
        subcategory: Stored as ``secondary_subcategory``.

    Returns:
        A dict with the selected series fields plus the two category tags.
    """
    # ``or {}`` also covers a key that is present but null, which a plain
    # ``.get(key, {})`` default would pass through as None.
    sd = r.get("series_description") or {}
    return {
        "idno": sd.get("idno"),
        "name": sd.get("name"),
        "database_id": sd.get("database_id"),
        "database_name": sd.get("database_name"),
        "definition_short": sd.get("definition_short"),
        "measurement_unit": sd.get("measurement_unit"),
        "periodicity": sd.get("periodicity"),
        "topics": sd.get("topics", []),
        "secondary_category": category,
        "secondary_subcategory": subcategory,
    }


def _idno_key(indicator):
    """Sort key: the indicator's idno, with a missing idno sorting first."""
    return indicator.get("idno") or ""


def _load_core_ids():
    """Collect indicator IDs from the core-transport result files.

    Each file is read independently, so one missing file does not discard
    the IDs found in the other.
    """
    core = set()
    for filename in CORE_INDICATOR_FILES:
        try:
            with open(RESULTS_DIR / filename, encoding="utf-8") as f:
                core.update(ind["idno"] for ind in json.load(f) if ind.get("idno"))
        except FileNotFoundError:
            continue
    return core


def _collect_secondary(searches, core):
    """Run every configured search and merge the hits by idno.

    Returns:
        ``(all_secondary, category_counts)``: a dict mapping idno to its
        indicator record (first category seen wins), and a nested
        category -> subcategory -> count mapping of newly added indicators.
    """
    all_secondary = {}  # idno -> indicator
    category_counts = defaultdict(lambda: defaultdict(int))

    for category, subcategory, method, query_or_topic in searches:
        label = f"{category} > {subcategory}"
        print(f" [{method}] {label}: ", end="", flush=True)
        try:
            if method == "topic":
                results, total = search_by_topic(query_or_topic)
            else:
                results, total = search_all(query_or_topic, max_results=200)

            new_count = 0
            for r in results:
                ind = parse_result(r, category, subcategory)
                idno = ind["idno"]
                if not idno or idno in core:
                    continue

                existing = all_secondary.get(idno)
                if existing is None:
                    all_secondary[idno] = ind
                    new_count += 1
                    category_counts[category][subcategory] += 1
                    continue

                # Track multiple categories for same indicator. Compare full
                # labels (the list stores "category > subcategory") and skip
                # the indicator's own primary label so nothing is repeated.
                primary = (
                    f"{existing['secondary_category']} > "
                    f"{existing['secondary_subcategory']}"
                )
                if label != primary and label not in existing.get("_also_in_categories", []):
                    existing.setdefault("_also_in_categories", []).append(label)

            print(f"{total} results, {new_count} new unique")
        except Exception as e:
            # Deliberately broad: one failing search (network error, odd
            # payload) is reported and the remaining searches still run.
            print(f"ERROR: {e}")

    return all_secondary, category_counts


def _print_summary(all_secondary, category_counts):
    """Print the overall total and the per-category breakdown."""
    print(f"\n{'='*70}")
    print(f"TOTAL SECONDARY INDICATORS: {len(all_secondary)}")
    print(f"{'='*70}")

    print("\nBy Category:")
    for cat in sorted(category_counts.keys()):
        cat_total = sum(category_counts[cat].values())
        print(f"\n {cat}: {cat_total} indicators")
        for subcat, count in sorted(category_counts[cat].items(), key=lambda x: -x[1]):
            print(f" {subcat}: {count}")


def _write_outputs(all_secondary):
    """Write the full list, the per-category view and the compact summary."""
    # Save all secondary indicators
    with open(RESULTS_DIR / "secondary-indicators-all.json", "w", encoding="utf-8") as f:
        json.dump(list(all_secondary.values()), f, indent=2)

    # Group a trimmed copy of each record under its primary category
    by_category = defaultdict(lambda: defaultdict(list))
    for ind in all_secondary.values():
        by_category[ind["secondary_category"]][ind["secondary_subcategory"]].append({
            "idno": ind["idno"],
            "name": ind["name"],
            "database_id": ind["database_id"],
            "database_name": ind["database_name"],
            "definition_short": ind.get("definition_short"),
            "measurement_unit": ind.get("measurement_unit"),
            "periodicity": ind.get("periodicity"),
        })

    # Convert to regular dict for JSON, sorted at every level
    by_cat_dict = {
        cat: {
            subcat: sorted(inds, key=_idno_key)
            for subcat, inds in sorted(subcats.items())
        }
        for cat, subcats in sorted(by_category.items())
    }
    with open(RESULTS_DIR / "secondary-by-category.json", "w", encoding="utf-8") as f:
        json.dump(by_cat_dict, f, indent=2)

    # Save compact summary: counts plus up to five sample indicators each,
    # with subcategories ordered from largest to smallest
    summary = {
        "total_secondary_indicators": len(all_secondary),
        "categories": {},
    }
    for cat, subcats in sorted(by_category.items()):
        cat_info = {"total": sum(len(v) for v in subcats.values()), "subcategories": {}}
        for subcat, inds in sorted(subcats.items(), key=lambda x: -len(x[1])):
            cat_info["subcategories"][subcat] = {
                "count": len(inds),
                "sample_indicators": [
                    f"{i['idno']}: {i['name']}"
                    for i in sorted(inds, key=_idno_key)[:5]
                ],
            }
        summary["categories"][cat] = cat_info
    with open(RESULTS_DIR / "secondary-summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)


def main():
    """Search all secondary categories, print a summary and save the results.

    Indicators whose idno appears in the core-transport result files are
    excluded. Three JSON files are written to ``RESULTS_DIR``.
    """
    # Load existing core transport indicators to exclude duplicates
    core = _load_core_ids()
    print(f"Excluding {len(core)} core transport indicators\n")

    all_secondary, category_counts = _collect_secondary(SECONDARY_SEARCHES, core)
    _print_summary(all_secondary, category_counts)
    _write_outputs(all_secondary)

    print(f"\nSaved: secondary-indicators-all.json ({len(all_secondary)} indicators)")
    print("Saved: secondary-by-category.json")
    print("Saved: secondary-summary.json")


if __name__ == "__main__":
    main()