Substrate/get-de-wastewater

#!/usr/bin/env bun

/**
 * Get DE Wastewater Surveillance Data (AMELAG / RKI + Umweltbundesamt)
 *
 * Fetches the population-weighted nationwide aggregated viral load time series
 * from the AMELAG GitHub repository and produces:
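 *   - Data/DE-Wastewater-Surveillance/wastewater-latest.csv   (all data points)
 *   - Data/DE-Wastewater-Surveillance/wastewater-summary.csv  (one row per primary pathogen)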
 *
 * Source: https://github.com/robert-koch-institut/Abwassersurveillance_AMELAG
 * Data: amelag_aggregierte_kurve.tsv (weekly, population-weighted national aggregate)
 * License: CC-BY 4.0 (Robert Koch-Institut / Umweltbundesamt)
 * No authentication required.
 *
 * Strategy:
 *  1. Fetch the aggregated TSV from GitHub raw.
 *  2. Parse the rows of all pathogen types present in the file.
 *  3. For each primary pathogen (SARS-CoV-2, Influenza A/B, RSV A/B), compute
 *     latest value, 4-week trend, and peak.
 *  4. Write a tidy CSV with all data points, plus a per-pathogen summary CSV.
 *  5. Print a summary with current status and trends.
 */

import { writeFileSync, mkdirSync } from "fs";
import { join } from "path";

// Raw-content URL of the aggregated-curve TSV on the `main` branch of the
// AMELAG GitHub repository; fetched without authentication.
const AMELAG_URL =
  "https://raw.githubusercontent.com/robert-koch-institut/Abwassersurveillance_AMELAG/main/amelag_aggregierte_kurve.tsv";

// Directory both output CSVs are written to, resolved relative to the
// directory of this script rather than the current working directory.
const OUT_DIR = join(__dirname, "Data/DE-Wastewater-Surveillance");

// Values of the `typ` column that get their own summary row and console line.
// Rows of any other type are still written to the full CSV but are not
// summarized (per the original note, the others are combined variants).
const PRIMARY_PATHOGENS = ["SARS-CoV-2", "Influenza A", "Influenza B", "RSV A", "RSV B"];

/**
 * One parsed data line of the AMELAG aggregated-curve TSV.
 *
 * Field names mirror the TSV column names they are read from. Rows without a
 * numeric `viruslast` are dropped by the parser; every other numeric field is
 * parsed as-is and may be NaN when the source cell is not a number.
 */
interface AmelagRow {
  /** Date cell exactly as it appears in the file; ordered by string comparison. */
  datum: string;
  /** Integer count; surfaced as "sites reporting" in the outputs. */
  n: number;
  /** Population share; multiplied by 100 for output, so presumably a 0–1 fraction — TODO confirm. */
  anteil_bev: number;
  /** Viral load; labeled gc/L (gene copies per liter) in the outputs. */
  viruslast: number;
  /** Normalized viral load; only written to the full CSV. */
  viruslast_normalisiert: number;
  /** Predicted value; labeled "GAM" in the outputs and used for the 4-week trend. */
  vorhersage: number;
  /** Upper bound (presumably of the band around `vorhersage` — TODO confirm). */
  obere_schranke: number;
  /** Lower bound (presumably of the band around `vorhersage` — TODO confirm). */
  untere_schranke: number;
  /** Pathogen type label, e.g. "SARS-CoV-2". */
  typ: string;
}

/**
 * Parse the AMELAG aggregated-curve TSV into typed rows.
 *
 * The first line is the header; columns are located by name, so their order
 * in the file does not matter. Data lines with fewer cells than the header,
 * and lines whose `viruslast` cell is not numeric, are skipped. All other
 * numeric cells are parsed as-is and may therefore be NaN.
 *
 * @param raw - Full text of the TSV file (LF or CRLF line endings).
 * @returns One entry per usable data line, in file order.
 * @throws Error if a required column is missing from the header. This is
 *   checked once up front, so it also fires for empty or header-only input.
 */
function parseTSV(raw: string): AmelagRow[] {
  // Split on LF or CRLF: with a plain "\n" split, CRLF input leaves a "\r" on
  // the last header name (so it can never be found) and on every last cell.
  const [headerLine = "", ...dataLines] = raw.trim().split(/\r?\n/);
  const headers = headerLine.split("\t");

  const idx = (name: string): number => {
    const i = headers.indexOf(name);
    if (i === -1) throw new Error(`Column '${name}' not found in TSV headers: ${headers.join(", ")}`);
    return i;
  };

  // Resolve every column position once instead of re-scanning the header for
  // each cell of each row; this also reports a changed schema immediately.
  const col = {
    viruslast: idx("viruslast"),
    datum: idx("datum"),
    n: idx("n"),
    anteil_bev: idx("anteil_bev"),
    viruslast_normalisiert: idx("viruslast_normalisiert"),
    vorhersage: idx("vorhersage"),
    obere_schranke: idx("obere_schranke"),
    untere_schranke: idx("untere_schranke"),
    typ: idx("typ"),
  };

  const rows: AmelagRow[] = [];
  for (const line of dataLines) {
    const cols = line.split("\t");
    // Truncated or blank line: not every named column can be read.
    if (cols.length < headers.length) continue;

    // Rows without a measured viral load carry nothing to report.
    const viruslast = parseFloat(cols[col.viruslast]);
    if (isNaN(viruslast)) continue;

    rows.push({
      datum: cols[col.datum],
      n: parseInt(cols[col.n], 10),
      anteil_bev: parseFloat(cols[col.anteil_bev]),
      viruslast,
      viruslast_normalisiert: parseFloat(cols[col.viruslast_normalisiert]),
      vorhersage: parseFloat(cols[col.vorhersage]),
      obere_schranke: parseFloat(cols[col.obere_schranke]),
      untere_schranke: parseFloat(cols[col.untere_schranke]),
      typ: cols[col.typ],
    });
  }

  return rows;
}

/** Per-pathogen summary derived from all rows sharing one `typ`. */
interface PathogenSummary {
  /** Pathogen type this summary describes. */
  typ: string;
  /** `datum` of the most recent row (rows are ordered by `datum` string). */
  latest_date: string;
  /** `viruslast` of the most recent row. */
  latest_viruslast: number;
  /** `vorhersage` of the most recent row. */
  latest_vorhersage: number;
  /** `n` of the most recent row. */
  sites_reporting: number;
  /** `anteil_bev` of the most recent row, multiplied by 100. */
  pop_coverage_pct: number;
  /**
   * Percent change of `vorhersage` between the most recent row and the row
   * four positions before it; 0 when there are fewer than five rows or the
   * earlier value is not positive.
   */
  trend_4w_pct: number;
  /** Highest `viruslast` over all rows of this pathogen. */
  peak_viruslast: number;
  /** `datum` of the row holding the peak. */
  peak_date: string;
  /** Number of rows found for this pathogen. */
  data_points: number;
}

/**
 * Summarize the time series of a single pathogen type.
 *
 * @param rows - Parsed rows of any mix of pathogen types; not modified.
 * @param typ - Exact `typ` value to summarize.
 * @returns The summary, or `null` when no row has that `typ`.
 */
function analyzePathogen(rows: AmelagRow[], typ: string): PathogenSummary | null {
  // `filter` returns a fresh array, so sorting it leaves the caller's rows alone.
  const series = rows
    .filter((r) => r.typ === typ)
    .sort((a, b) => a.datum.localeCompare(b.datum));

  if (series.length === 0) return null;

  const latest = series[series.length - 1];

  // 4-week trend: compare the latest vorhersage with the one four rows
  // earlier (this treats the series as one row per week). The trend stays 0
  // when it cannot be computed: too few rows, a non-positive baseline, or a
  // missing (NaN) value on either side — otherwise NaN would leak into the
  // CSV and the console report.
  let trend_4w_pct = 0;
  if (series.length >= 5) {
    const current = latest.vorhersage;
    const baseline = series[series.length - 5].vorhersage;
    if (Number.isFinite(current) && Number.isFinite(baseline) && baseline > 0) {
      trend_4w_pct = ((current - baseline) / baseline) * 100;
    }
  }

  // Peak: highest measured viral load; on ties the earliest row wins.
  const peak = series.reduce((best, r) => (r.viruslast > best.viruslast ? r : best));

  return {
    typ,
    latest_date: latest.datum,
    latest_viruslast: latest.viruslast,
    latest_vorhersage: latest.vorhersage,
    sites_reporting: latest.n,
    pop_coverage_pct: latest.anteil_bev * 100,
    trend_4w_pct,
    peak_viruslast: peak.viruslast,
    peak_date: peak.datum,
    data_points: series.length,
  };
}

/**
 * Map a percent change to a trend glyph.
 *
 * @param pct - Percent change (positive = increase).
 * @returns "↑↑" above 20, "↑" above 5, "→" above -5, "↓" above -20, "↓↓"
 *   otherwise; "?" when `pct` is NaN.
 */
function trendArrow(pct: number): string {
  // NaN fails every comparison below and would otherwise fall through to the
  // "strong decline" glyph, misreporting an unknown trend.
  if (Number.isNaN(pct)) return "?";
  if (pct > 20) return "↑↑";
  if (pct > 5) return "↑";
  if (pct > -5) return "→";
  if (pct > -20) return "↓";
  return "↓↓";
}

/**
 * Run the whole pipeline: download the AMELAG TSV, write the full and the
 * summary CSV into OUT_DIR, and print a status report to the console.
 *
 * @throws Error on a non-OK HTTP response, on a missing TSV column, or when
 *   the file yields no usable rows (so existing output files are never
 *   replaced by header-only ones).
 */
async function main(): Promise<void> {
  console.log("Fetching DE wastewater surveillance data (AMELAG)…\n");

  // Fetch TSV
  const res = await fetch(AMELAG_URL);
  if (!res.ok) throw new Error(`HTTP ${res.status} fetching AMELAG data`);
  const raw = await res.text();
  const rows = parseTSV(raw);
  if (rows.length === 0) {
    // Fail before touching the output files: an empty parse means the upstream
    // format changed or the download was truncated.
    throw new Error("AMELAG TSV contained no usable data rows");
  }
  console.log(`Parsed ${rows.length} data points from AMELAG aggregated curve.\n`);

  // Only create the output directory once there is something to write.
  mkdirSync(OUT_DIR, { recursive: true });

  // Quote a text cell only when it contains a delimiter, quote, or line break,
  // so ordinary values are written exactly as before.
  const csvField = (value: string): string =>
    /[",\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value;

  // Get all unique pathogen types
  const allTypes = [...new Set(rows.map((r) => r.typ))].sort();
  console.log(`Pathogen types found: ${allTypes.join(", ")}\n`);

  // Write full CSV (all pathogens, all data points). Sort a copy so the parsed
  // rows keep their file order for everything that follows.
  const csvHeader =
    "datum,typ,n_sites,pop_coverage_pct,viruslast_gc_per_l,viruslast_normalisiert,vorhersage_gam,obere_schranke,untere_schranke";
  const csvRows = [...rows]
    .sort((a, b) => a.datum.localeCompare(b.datum) || a.typ.localeCompare(b.typ))
    .map(
      (r) =>
        `${csvField(r.datum)},${csvField(r.typ)},${r.n},${(r.anteil_bev * 100).toFixed(2)},${r.viruslast.toFixed(2)},${r.viruslast_normalisiert.toFixed(2)},${r.vorhersage.toFixed(2)},${r.obere_schranke.toFixed(2)},${r.untere_schranke.toFixed(2)}`
    );

  const csvPath = join(OUT_DIR, "wastewater-latest.csv");
  writeFileSync(csvPath, [csvHeader, ...csvRows].join("\n") + "\n");
  console.log(`Wrote ${csvPath} (${csvRows.length} rows)\n`);

  // Analyze primary pathogens; types absent from the data are simply omitted.
  const summaries: PathogenSummary[] = [];
  for (const typ of PRIMARY_PATHOGENS) {
    const s = analyzePathogen(rows, typ);
    if (s) summaries.push(s);
  }

  // Write summary CSV
  const summaryHeader =
    "typ,latest_date,latest_viruslast_gc_per_l,latest_vorhersage_gam,sites_reporting,pop_coverage_pct,trend_4w_pct,peak_viruslast,peak_date,data_points";
  const summaryRows = summaries.map(
    (s) =>
      `${csvField(s.typ)},${csvField(s.latest_date)},${s.latest_viruslast.toFixed(2)},${s.latest_vorhersage.toFixed(2)},${s.sites_reporting},${s.pop_coverage_pct.toFixed(2)},${s.trend_4w_pct.toFixed(1)},${s.peak_viruslast.toFixed(2)},${csvField(s.peak_date)},${s.data_points}`
  );

  const summaryPath = join(OUT_DIR, "wastewater-summary.csv");
  writeFileSync(summaryPath, [summaryHeader, ...summaryRows].join("\n") + "\n");
  console.log(`Wrote ${summaryPath}\n`);

  // Print console summary
  console.log("── Current Status ──────────────────────────────────────\n");

  for (const s of summaries) {
    const arrow = trendArrow(s.trend_4w_pct);
    console.log(
      `${s.typ.padEnd(16)} ${arrow} ${s.trend_4w_pct > 0 ? "+" : ""}${s.trend_4w_pct.toFixed(1)}%  ` +
        `Latest: ${s.latest_viruslast.toFixed(0)} gc/L  ` +
        `GAM: ${s.latest_vorhersage.toFixed(0)} gc/L  ` +
        `(${s.sites_reporting} sites, ${s.pop_coverage_pct.toFixed(1)}% pop)  ` +
        `Peak: ${s.peak_viruslast.toFixed(0)} gc/L (${s.peak_date})`
    );
  }

  // Relative-to-peak line; skipped when the peak is not positive, because the
  // ratio would then be NaN or Infinity.
  const covid = summaries.find((s) => s.typ === "SARS-CoV-2");
  if (covid && covid.peak_viruslast > 0) {
    const pctOfPeak = (covid.latest_viruslast / covid.peak_viruslast) * 100;
    console.log(
      `\nSARS-CoV-2 is at ${pctOfPeak.toFixed(1)}% of its all-time peak (${covid.peak_date}).`
    );
  }

  console.log(
    "\nData: AMELAG (RKI + Umweltbundesamt), CC-BY 4.0"
  );
  console.log("Units: gene copies per liter (gc/L), population-weighted national aggregate.");
}

// Entry point: run the pipeline; on any failure print the reason and exit
// with status 1.
main().catch((err: unknown) => {
  // A rejection is not guaranteed to be an Error; reading `.message` from an
  // arbitrary value would print "undefined" and hide the actual reason.
  console.error("Error:", err instanceof Error ? err.message : err);
  process.exit(1);
});