#!/usr/bin/env python3
"""
World Bank Data360 API Explorer - Transport Sector
Systematically discovers all transport-related indicators, topics, and databases.
"""

import json
import requests
import sys
import os
from collections import defaultdict
from pathlib import Path

BASE = "https://data360api.worldbank.org/data360"
RESULTS_DIR = Path(__file__).parent.parent / "results"
# parents=True so the script also works when the repo-level parent dir is absent.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Seconds before an API call is abandoned; without a timeout a stalled
# connection would hang the whole exploration run indefinitely.
REQUEST_TIMEOUT = 60

# OData select paths requested from the search endpoint (built once at import).
_SELECT_FIELDS = ",".join([
    "series_description/idno",
    "series_description/name",
    "series_description/database_id",
    "series_description/database_name",
    "series_description/definition_short",
    "series_description/topics",
    "series_description/measurement_unit",
    "series_description/periodicity",
])


def search(query, top=100, skip=0):
    """Query the Data360 search endpoint for one page of results.

    Args:
        query: Free-text search term.
        top: Maximum number of results to return (page size).
        skip: Offset into the result set, for pagination.

    Returns:
        The parsed JSON response as a dict.

    Raises:
        requests.HTTPError: On a non-2xx response.
        requests.Timeout: If the API does not answer within REQUEST_TIMEOUT.
    """
    resp = requests.post(
        f"{BASE}/searchv2",
        json={
            "search": query,
            "select": _SELECT_FIELDS,
            "top": top,
            "skip": skip,
            "count": True,
        },
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return resp.json()


def search_all(query, batch=100):
    """Paginate through every search result for *query*.

    Args:
        query: Free-text search term.
        batch: Page size used for each request.

    Returns:
        Tuple ``(results, total)`` where *results* is the list of raw result
        dicts and *total* is the server-reported match count.
    """
    all_results = []
    skip = 0
    total = None
    while True:
        data = search(query, top=batch, skip=skip)
        if total is None:
            total = data.get("@odata.count", 0)
            print(f" Total results for '{query}': {total}")
        values = data.get("value", [])
        if not values:
            # Empty page: the server has no more results regardless of count.
            break
        all_results.extend(values)
        skip += batch
        if skip >= total:
            break
    return all_results, total


def get_indicators(dataset_id):
    """Fetch the full indicator list for one dataset.

    Args:
        dataset_id: Data360 dataset/database identifier.

    Returns:
        The parsed JSON response (typically a list of indicator dicts).

    Raises:
        requests.HTTPError: On a non-2xx response.
    """
    resp = requests.get(
        f"{BASE}/indicators",
        params={"datasetId": dataset_id},
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return resp.json()


def save_json(data, filename):
    """Save *data* as pretty-printed UTF-8 JSON in the results directory.

    ensure_ascii=False keeps non-ASCII indicator/topic names readable in the
    output files instead of escaping them.
    """
    path = RESULTS_DIR / filename
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f" Saved: {path}")


def _discover_indicators(search_terms):
    """Step 1: run every search term and deduplicate indicators by idno.

    Returns:
        Tuple ``(all_indicators, search_summary)`` — idno-keyed indicator
        metadata dict, and per-term total/fetched counts.
    """
    all_indicators = {}
    search_summary = {}
    for term in search_terms:
        print(f"\n Searching: '{term}'")
        results, total = search_all(term)
        search_summary[term] = {"total": total, "fetched": len(results)}
        for r in results:
            sd = r.get("series_description", {})
            idno = sd.get("idno")
            if idno and idno not in all_indicators:
                all_indicators[idno] = {
                    "idno": idno,
                    "name": sd.get("name"),
                    "database_id": sd.get("database_id"),
                    "database_name": sd.get("database_name"),
                    "definition_short": sd.get("definition_short"),
                    "measurement_unit": sd.get("measurement_unit"),
                    "periodicity": sd.get("periodicity"),
                    # "or []" guards against an explicit JSON null, which
                    # .get's default would not catch.
                    "topics": sd.get("topics") or [],
                    "found_via": term,
                }
    return all_indicators, search_summary


def _extract_topics(all_indicators):
    """Step 2: collect the unique topic taxonomy, save it, and print a tree.

    Returns:
        Dict of topic-id -> topic metadata.
    """
    topics = {}
    for ind in all_indicators.values():
        for t in ind.get("topics") or []:
            tid = t.get("id")
            if tid and tid not in topics:
                topics[tid] = {
                    "id": tid,
                    "name": t.get("name"),
                    "parent_id": t.get("parent_id"),
                    "vocabulary": t.get("vocabulary"),
                }

    # Build parent_id -> children adjacency for the hierarchy printout.
    topic_tree = defaultdict(list)
    for t in topics.values():
        topic_tree[t.get("parent_id")].append(t)

    print(f" Unique topics: {len(topics)}")
    save_json(dict(topics), "topics-taxonomy.json")

    print("\n Topic Hierarchy:")

    def print_tree(parent_id, indent=0):
        # Roots are the entries whose parent_id is None.
        children = sorted(topic_tree.get(parent_id, []), key=lambda x: x["id"])
        for c in children:
            print(f" {' ' * indent}{c['id']}: {c['name']} [{c.get('vocabulary', '')}]")
            print_tree(c["id"], indent + 1)

    print_tree(None)
    return topics


def _extract_databases(all_indicators):
    """Step 3: group indicators by source database and save a summary.

    Returns:
        Tuple ``(databases, db_summary)`` — the raw per-database accumulator
        and the JSON-serializable summary (sets converted to sorted lists).
    """
    databases = defaultdict(
        lambda: {"indicator_count": 0, "indicators": [], "topics": set()}
    )
    for ind in all_indicators.values():
        db_id = ind.get("database_id")
        if db_id:
            databases[db_id]["indicator_count"] += 1
            databases[db_id]["indicators"].append(ind["idno"])
            for t in ind.get("topics") or []:
                databases[db_id]["topics"].add(t.get("name", ""))

    # Convert sets to lists for JSON, largest databases first.
    db_summary = {}
    for db_id, info in sorted(
        databases.items(), key=lambda x: -x[1]["indicator_count"]
    ):
        db_summary[db_id] = {
            "indicator_count": info["indicator_count"],
            "topics": sorted(info["topics"]),
            "sample_indicators": info["indicators"][:10],
        }
        print(f" {db_id}: {info['indicator_count']} indicators")
    save_json(db_summary, "databases-with-transport.json")
    return databases, db_summary


def _fetch_database_indicators(databases):
    """Step 4: download the complete indicator list for each database.

    Best-effort: a failure for one database is reported and skipped so the
    remaining databases are still fetched.
    """
    for db_id in sorted(databases.keys()):
        print(f"\n Fetching indicators for: {db_id}")
        try:
            indicators = get_indicators(db_id)
            save_json(indicators, f"indicators-{db_id}.json")
            count = len(indicators) if isinstance(indicators, list) else "?"
            print(f" Got {count} indicators")
        except Exception as e:
            # Deliberately broad: network, HTTP, JSON, and filesystem errors
            # must not abort the remaining downloads.
            print(f" Error: {e}")


def _build_report(all_indicators, databases, topics, db_summary, search_terms):
    """Step 5: aggregate all collected data into the final report file."""
    # Group indicators by L1 and L2 topics only.
    topic_indicators = defaultdict(list)
    for ind in all_indicators.values():
        for t in ind.get("topics") or []:
            # "or ''" guards against an explicit null vocabulary.
            vocab = t.get("vocabulary") or ""
            if "L1" in vocab or "L2" in vocab:
                topic_indicators[f"{t['id']}: {t['name']}"].append(ind["idno"])

    report = {
        "summary": {
            "total_unique_indicators": len(all_indicators),
            "total_databases": len(databases),
            "total_topics": len(topics),
            "search_terms_used": search_terms,
        },
        "databases": db_summary,
        "topic_indicator_counts": {
            k: len(v) for k, v in sorted(topic_indicators.items())
        },
        "topics_taxonomy": dict(topics),
    }
    save_json(report, "transport-exploration-report.json")


def main():
    """Run the five-step transport-sector exploration end to end."""
    print("=" * 70)
    print("WORLD BANK DATA360 - TRANSPORT SECTOR EXPLORATION")
    print("=" * 70)

    # ── Step 1: Search for transport-related indicators ──
    print("\n[1] Searching for transport-related indicators...")
    search_terms = [
        "transport", "road", "railway", "aviation", "shipping",
        "logistics", "freight", "infrastructure transport", "road safety",
        "vehicle", "port", "air transport", "maritime", "traffic",
        "mobility",
    ]
    all_indicators, search_summary = _discover_indicators(search_terms)
    print(f"\n Total unique indicators found: {len(all_indicators)}")
    save_json(search_summary, "search-summary.json")
    save_json(list(all_indicators.values()), "all-transport-indicators.json")

    # ── Step 2: Extract topic taxonomy ──
    print("\n[2] Extracting topic taxonomy...")
    topics = _extract_topics(all_indicators)

    # ── Step 3: Extract databases ──
    print("\n[3] Extracting databases containing transport data...")
    databases, db_summary = _extract_databases(all_indicators)

    # ── Step 4: For each database, get full indicator list ──
    print("\n[4] Fetching full indicator lists for key databases...")
    _fetch_database_indicators(databases)

    # ── Step 5: Generate summary report ──
    print("\n[5] Generating summary report...")
    _build_report(all_indicators, databases, topics, db_summary, search_terms)

    print("\n" + "=" * 70)
    print("EXPLORATION COMPLETE")
    print(f" Unique indicators: {len(all_indicators)}")
    print(f" Databases: {len(databases)}")
    print(f" Topics: {len(topics)}")
    print("=" * 70)


if __name__ == "__main__":
    main()