#!/usr/bin/env python3
"""
Filter the raw exploration results down to indicators that are genuinely
transport-related, based on the Data360 topic taxonomy, with a keyword match
on the indicator name and short definition as a fallback.

Transport is under: P4 (Infrastructure) > P4_000003 (Transport)
Sub-topics (L2):
  P4_000023: Air Transport
  P4_000025: Green and Inclusive Mobility
  P4_000026: Highway Asset Management and Rural Access
  P4_000027: Maritime Transport and Logistics
  P4_000029: Railways
  P4_000030: Regional Connectivity and Transport Corridors
  P4_000031: Road Safety
  P4_000032: Transport Economics
  P4_000033: Urban Mobility
"""

import json
import re
from collections import defaultdict
from pathlib import Path

# JSON inputs and outputs live in "results", a sibling of this script's directory.
RESULTS_DIR = Path(__file__).parent.parent.joinpath("results")

# Data360 taxonomy IDs that mark an indicator as transport-related:
# the parent "Transport" topic plus each of its L2 sub-topics.
TRANSPORT_TOPIC_IDS = {"P4_000003"} | {
    "P4_000023",  # Air Transport
    "P4_000025",  # Green and Inclusive Mobility
    "P4_000026",  # Highway Asset Management and Rural Access
    "P4_000027",  # Maritime Transport and Logistics
    "P4_000029",  # Railways
    "P4_000030",  # Regional Connectivity and Transport Corridors
    "P4_000031",  # Road Safety
    "P4_000032",  # Transport Economics
    "P4_000033",  # Urban Mobility
}

# Fallback for indicators that are clearly about transport but carry no
# transport topic tag: these lowercase keywords are looked up in the
# lowercased indicator name and short definition.
# NOTE(review): "rail " carries a trailing space, presumably on purpose so it
# does not match inside longer words -- confirm before normalising it.
TRANSPORT_KEYWORDS = [
    "transport",
    "road",
    "railway",
    "railroad",
    "aviation",
    "airport",
    "shipping",
    "freight",
    "logistics",
    "port",
    "maritime",
    "vehicle",
    "traffic",
    "highway",
    "bus",
    "rail ",
    "airline",
    "cargo",
    "mobility",
    "passenger",
    "fleet",
]


def load_all_indicators():
    """Load the raw search results from ``all-transport-indicators.json``.

    Returns:
        The parsed JSON content of ``RESULTS_DIR / "all-transport-indicators.json"``.

    Raises:
        FileNotFoundError: If the results file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Read as UTF-8 explicitly: the platform default encoding can fail on
    # non-ASCII indicator names (e.g. on Windows).
    with open(RESULTS_DIR / "all-transport-indicators.json", encoding="utf-8") as f:
        return json.load(f)


def is_transport_by_topic(indicator):
    """Return True if any topic tag of *indicator* is a transport topic.

    Args:
        indicator: Indicator dict; its optional ``"topics"`` entry is a list
            of dicts whose ``"id"`` is compared against ``TRANSPORT_TOPIC_IDS``.

    Returns:
        True when at least one topic id is in ``TRANSPORT_TOPIC_IDS``,
        False otherwise (including when ``"topics"`` is missing, null or empty).
    """
    # `or []` also covers an explicit `"topics": null`, which `.get(key, [])`
    # would pass through as None and crash the iteration.
    return any(
        topic.get("id") in TRANSPORT_TOPIC_IDS
        for topic in indicator.get("topics") or []
    )


def is_transport_by_keyword(indicator):
    """Return True if the indicator's name or short definition has a transport keyword.

    The ``"name"`` and ``"definition_short"`` values are joined and lowercased,
    then each entry of ``TRANSPORT_KEYWORDS`` is searched for at the start of a
    word. Anchoring at a word start keeps derived forms such as "roads" or
    "transportation" while rejecting keywords buried inside unrelated words
    ("port" in "export"/"import"/"report", "road" in "broad").

    Known limitation: a keyword that is a prefix of an unrelated word still
    matches (e.g. "bus" in "business"), and compounds such as "seaport" only
    match if they are listed as keywords themselves.

    Args:
        indicator: Indicator dict; missing or null text fields count as empty.

    Returns:
        True if at least one keyword matches, False otherwise.
    """
    # `or ""` keeps a null field from being rendered as the literal "None".
    name = indicator.get("name") or ""
    definition = indicator.get("definition_short") or ""
    text = f"{name} {definition}".lower()
    return any(
        re.search(rf"\b{re.escape(keyword)}", text) is not None
        for keyword in TRANSPORT_KEYWORDS
    )


def get_transport_subtopics(indicator):
    """Return the names of the L2 transport sub-topics tagged on *indicator*.

    The parent topic ``P4_000003`` and any topic outside
    ``TRANSPORT_TOPIC_IDS`` are skipped.

    Args:
        indicator: Indicator dict; its optional ``"topics"`` entry is a list
            of dicts with ``"id"`` and ``"name"``.

    Returns:
        A list of sub-topic names in the order they are tagged. A topic whose
        name is missing, null or empty is reported by its id instead. The list
        is empty when there are no sub-topic tags.
    """
    subtopics = []
    # `or []` tolerates an explicit `"topics": null`.
    for topic in indicator.get("topics") or []:
        topic_id = topic.get("id")
        if topic_id == "P4_000003" or topic_id not in TRANSPORT_TOPIC_IDS:
            continue
        # Fall back to the id for null/empty names too, so callers always get
        # a usable string label to group and sort by.
        subtopics.append(topic.get("name") or topic_id)
    return subtopics


def _idno_sort_key(indicator):
    """Sort key on ``idno`` that treats a missing or null value as ""."""
    return indicator.get("idno") or ""


def _by_size_desc(groups):
    """Return ``(key, members)`` pairs ordered from the largest group to the smallest."""
    return sorted(groups.items(), key=lambda kv: -len(kv[1]))


def main():
    """Filter the loaded indicators, print a report and write two JSON files.

    Steps:
      1. Keep indicators matched by transport topic tag or by keyword, and
         annotate each with ``_matched_by`` and ``_transport_subtopics``.
      2. Group the kept indicators by sub-topic and by database, printing a
         short report for both groupings.
      3. Write ``transport-indicators-filtered.json`` (the annotated
         indicators) and ``transport-summary.json`` (counts and compact
         per-group listings) into ``RESULTS_DIR``.
    """
    indicators = load_all_indicators()
    print(f"Total indicators from search: {len(indicators)}")

    # Filter to transport-relevant indicators; a topic match takes precedence
    # over a keyword match when recording how the indicator was selected.
    transport_indicators = []
    for ind in indicators:
        by_topic = is_transport_by_topic(ind)
        if not (by_topic or is_transport_by_keyword(ind)):
            continue
        ind["_matched_by"] = "topic" if by_topic else "keyword_only"
        ind["_transport_subtopics"] = get_transport_subtopics(ind)
        transport_indicators.append(ind)

    print(f"Transport-relevant indicators: {len(transport_indicators)}")

    # Group by sub-topic; an indicator is listed under every sub-topic it is
    # tagged with, or under a catch-all group when it has none.
    by_subtopic = defaultdict(list)
    for ind in transport_indicators:
        for st in ind["_transport_subtopics"] or ["General / Unclassified"]:
            by_subtopic[st].append(ind)

    subtopics_by_size = _by_size_desc(by_subtopic)
    print("\n=== TRANSPORT SUB-TOPICS ===")
    for st, inds in subtopics_by_size:
        print(f"\n  {st}: {len(inds)} indicators")
        for i in inds[:5]:
            print(f"    - {i.get('idno')}: {i.get('name')}")
        if len(inds) > 5:
            print(f"    ... and {len(inds) - 5} more")

    # Group by database. A missing *or null* database_id goes to "unknown" so
    # the counts and the per-database listing below use the same keys.
    by_database = defaultdict(list)
    for ind in transport_indicators:
        by_database[ind.get("database_id") or "unknown"].append(ind)

    databases_by_size = _by_size_desc(by_database)
    print("\n\n=== DATABASES WITH TRANSPORT INDICATORS ===")
    for db, inds in databases_by_size:
        # Groups are only created on append, so inds is never empty here.
        db_name = inds[0].get("database_name", db)
        print(f"  {db} ({db_name}): {len(inds)} indicators")

    # Save filtered results
    filtered_path = RESULTS_DIR / "transport-indicators-filtered.json"
    with open(filtered_path, "w", encoding="utf-8") as f:
        json.dump(transport_indicators, f, indent=2)

    # Save a clean summary: counts ordered by group size, listings by key.
    summary = {
        "total_transport_indicators": len(transport_indicators),
        "subtopic_counts": {st: len(inds) for st, inds in subtopics_by_size},
        "database_counts": {db: len(inds) for db, inds in databases_by_size},
        "subtopics": {},
        "databases": {},
    }

    for st in sorted(by_subtopic):
        summary["subtopics"][st] = [
            {
                "idno": i.get("idno"),
                "name": i.get("name"),
                "database_id": i.get("database_id"),
                "unit": i.get("measurement_unit"),
                "periodicity": i.get("periodicity"),
            }
            for i in sorted(by_subtopic[st], key=_idno_sort_key)
        ]

    for db in sorted(by_database):
        inds = by_database[db]
        summary["databases"][db] = {
            "database_name": inds[0].get("database_name", db),
            "indicator_count": len(inds),
            "indicators": [
                {
                    "idno": i.get("idno"),
                    "name": i.get("name"),
                    "subtopics": i["_transport_subtopics"],
                }
                for i in sorted(inds, key=_idno_sort_key)
            ],
        }

    with open(RESULTS_DIR / "transport-summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)

    print(f"\nSaved: transport-indicators-filtered.json ({len(transport_indicators)} indicators)")
    print("Saved: transport-summary.json")


# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()