Files
Substrate/Data/Bay-Area-COVID-Wastewater/update-covid-data
Daniel Miessler 9066ad477b Add Bay Area COVID wastewater and Pulitzer Prize datasets
Added two comprehensive datasets with full documentation:

1. Bay Area COVID-19 Wastewater Surveillance (2022-2025)
   - California statewide COVID-19 wastewater data
   - 161 weekly data points from CDPH
   - Leading health indicator for viral trends
   - Includes automated update scripts

2. Pulitzer Prize Winners - Arts & Letters (1918-2024)
   - 249 winners across 107 years
   - Poetry, Drama, and General/Special categories
   - High-quality curated data from Wikidata
   - CSV files for each category

Added master Data directory documentation (Data/README.md) describing:
- Data philosophy and quality standards
- All four current datasets
- Contribution guidelines
- File naming conventions

Includes utility commands:
- get-bay-area-covid-status: Analyze current COVID wastewater levels
- get-california-wastewater-data: Fetch latest surveillance data

Updated .gitignore to exclude large raw data files (278MB+).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-16 22:09:43 -07:00

153 lines
4.9 KiB
Plaintext
Executable File

#!/usr/bin/env bun
/**
* Update COVID-19 Wastewater Data
*
* Fetches the latest California statewide wastewater data from CDPH
* and updates the local CSV dataset.
*/
import { writeFileSync, readFileSync } from 'fs';
import { join } from 'path';
/** Download URL of the CDPH wastewater CSV that fetchLatestData() requests. */
const DATA_URL = 'https://data.chhs.ca.gov/dataset/1184f641-313f-47ee-b126-9e8c42699be5/resource/726752d3-afe6-4733-99bd-ffb9f400348c/download/wastewater.csv';
/** Output CSV written by processAndSaveData(); resolved relative to this script's directory. */
const CSV_PATH = join(__dirname, 'COVID-Wastewater-California-Statewide-2022-2025.csv');
/** Changelog file that updateChangelog() reads and rewrites; resolved relative to this script's directory. */
const UPDATES_PATH = join(__dirname, 'UPDATES.md');
/** One row of the source CSV as produced by parseCSV(); every field holds the raw, unconverted text. */
interface RawWastewaterData {
/** Value of the season column; copied unchanged into the output CSV. */
season: string;
/** Week-ending date as it appears in the source; convertToISO() rewrites MM/DD/YYYY values to YYYY-MM-DD before saving. */
week_ending_date: string;
/** Measurement text; parsed with parseFloat() and rounded to two decimals when saved (the output notes column labels it "Log10 viral gene copies/mL"). */
sars_cov2_log10_copies_ml: string;
}
/**
 * Downloads the raw wastewater CSV from DATA_URL.
 *
 * @returns The response body as text.
 * @throws Error when the server responds with a non-OK status. The message
 *   always carries the numeric status code, followed by the status text when
 *   the server supplied one.
 */
async function fetchLatestData(): Promise<string> {
  console.log('📡 Fetching latest COVID wastewater data from CDPH...\n');
  const response = await fetch(DATA_URL);
  if (!response.ok) {
    // statusText is an empty string over HTTP/2, so the numeric code is the
    // only reliable description of the failure.
    const reason = response.statusText ? ` ${response.statusText}` : '';
    throw new Error(`Failed to fetch data: HTTP ${response.status}${reason}`);
  }
  return response.text();
}
/**
 * Parses the source CSV text into raw records.
 *
 * Columns are located by header name (case-insensitive). When a header is
 * missing, the positional layout is used instead: season, week_ending_date,
 * sars_cov2_log10_copies_ml in columns 0, 1, 2. Both LF and CRLF line endings
 * are accepted and cell values are trimmed.
 *
 * This is a plain comma split: quoted fields that contain commas are not
 * supported.
 *
 * @param csvContent Full CSV text with the header row first.
 * @returns Rows that have both a week-ending date and a measurement; rows
 *   missing either one are dropped.
 */
function parseCSV(csvContent: string): RawWastewaterData[] {
  // /\r?\n/ keeps a stray '\r' from sticking to the last cell of each row,
  // which would otherwise make an empty measurement look non-empty.
  const lines = csvContent.trim().split(/\r?\n/);
  const headers = (lines[0] ?? '')
    .toLowerCase()
    .split(',')
    .map(header => header.trim());

  const columnOf = (name: string, fallback: number): number => {
    const found = headers.indexOf(name);
    return found === -1 ? fallback : found;
  };
  const seasonCol = columnOf('season', 0);
  const dateCol = columnOf('week_ending_date', 1);
  const valueCol = columnOf('sars_cov2_log10_copies_ml', 2);

  return lines
    .slice(1)
    .map((line): RawWastewaterData => {
      const values = line.split(',').map(value => value.trim());
      return {
        season: values[seasonCol] ?? '',
        week_ending_date: values[dateCol] ?? '',
        sars_cov2_log10_copies_ml: values[valueCol] ?? '',
      };
    })
    .filter(row => row.week_ending_date !== '' && row.sars_cov2_log10_copies_ml !== '');
}
/**
 * Formats a date string for display in en-US short form, e.g. "Oct 11, 2025".
 *
 * @param dateStr Any string accepted by the Date constructor. Date-only ISO
 *   strings (YYYY-MM-DD) are formatted in UTC so the printed calendar day
 *   matches the input on every machine.
 * @returns The formatted date, or whatever toLocaleDateString yields for an
 *   unparseable input.
 */
function formatDate(dateStr: string): string {
  const date = new Date(dateStr);
  const options: Intl.DateTimeFormatOptions = {
    month: 'short',
    day: 'numeric',
    year: 'numeric',
  };
  // A date-only ISO string is parsed as UTC midnight. Formatting it in a local
  // zone west of UTC would print the previous day, so pin the zone to UTC.
  if (/^\d{4}-\d{2}-\d{2}$/.test(dateStr)) {
    options.timeZone = 'UTC';
  }
  return date.toLocaleDateString('en-US', options);
}
/**
 * Rewrites a slash-separated MM/DD/YYYY date as YYYY-MM-DD.
 *
 * @param dateStr Date text from the source CSV.
 * @returns The ISO-style date with month and day zero-padded to two digits.
 *   Input that does not consist of exactly three slash-separated parts is
 *   returned unchanged.
 */
function convertToISO(dateStr: string): string {
  const pieces = dateStr.split('/');
  if (pieces.length !== 3) {
    return dateStr;
  }
  const mm = pieces[0].padStart(2, '0');
  const dd = pieces[1].padStart(2, '0');
  const yyyy = pieces[2];
  return [yyyy, mm, dd].join('-');
}
/**
 * Sorts the records chronologically and writes them to CSV_PATH.
 *
 * Side effects: `rawData` is sorted in place (oldest first), CSV_PATH is
 * overwritten, and a summary line is logged.
 *
 * @param rawData Records to persist. Dates are converted with convertToISO()
 *   and measurements rounded to two decimals.
 */
function processAndSaveData(rawData: RawWastewaterData[]): void {
  const toMillis = (row: RawWastewaterData): number =>
    new Date(row.week_ending_date).getTime();

  // In-place ascending sort: oldest week first in the output file.
  rawData.sort((left, right) => toMillis(left) - toMillis(right));

  const headerLine = 'season,week_ending_date,sars_cov2_log10_copies_ml,data_source,region,notes';
  const dataLines = rawData.map(row => {
    const isoDate = convertToISO(row.week_ending_date);
    const roundedValue = parseFloat(row.sars_cov2_log10_copies_ml).toFixed(2);
    return `${row.season},${isoDate},${roundedValue},CDPH California Wastewater Surveillance,California Statewide,Log10 viral gene copies/mL`;
  });

  const fileBody = [headerLine, ...dataLines].join('\n') + '\n';
  writeFileSync(CSV_PATH, fileBody);
  console.log(`✅ Updated dataset: ${rawData.length} records saved\n`);
}
/**
 * Adds an entry describing this update run to the changelog at UPDATES_PATH.
 *
 * The entry is inserted immediately before the first "## Future Updates"
 * heading. If the heading is absent, the entry is appended to the end of the
 * file and a warning is logged, so the update is never silently dropped.
 * Any read/write failure is logged and swallowed: the changelog is
 * best-effort and must not fail the data update.
 *
 * @param latestRecord Most recent measurement; its date and value are quoted in the entry.
 * @param recordCount Total number of records in the dataset.
 */
function updateChangelog(latestRecord: RawWastewaterData, recordCount: number): void {
  const updateDate = new Date().toISOString().split('T')[0];
  const latestDate = convertToISO(latestRecord.week_ending_date);
  const latestValue = parseFloat(latestRecord.sars_cov2_log10_copies_ml).toFixed(2);
  // NOTE(review): the period start date below is hard-coded, not derived from the data.
  const changelogEntry = `
## ${updateDate} - Automated Data Update
**Data Period**: 2022-07-09 to ${latestDate}
**Source**: CDPH California Wastewater Surveillance
**URL**: ${DATA_URL}
### Changes
- Updated dataset with latest wastewater measurements
- Total records: ${recordCount}
### Latest Value
- **Week Ending**: ${formatDate(latestDate)}
- **SARS-CoV-2**: ${latestValue} log10 copies/mL
---
`;
  try {
    const currentChangelog = readFileSync(UPDATES_PATH, 'utf-8');
    const futureUpdatesMarker = '## Future Updates';
    const markerIndex = currentChangelog.indexOf(futureUpdatesMarker);

    let updatedChangelog: string;
    if (markerIndex === -1) {
      console.log(`⚠️ "${futureUpdatesMarker}" section not found in UPDATES.md; appending entry at end of file\n`);
      updatedChangelog = currentChangelog + changelogEntry;
    } else {
      updatedChangelog =
        currentChangelog.slice(0, markerIndex) +
        changelogEntry +
        currentChangelog.slice(markerIndex);
    }

    writeFileSync(UPDATES_PATH, updatedChangelog);
    console.log('📝 Updated UPDATES.md changelog\n');
  } catch (error) {
    console.log('⚠️ Could not update changelog:', error);
  }
}
// Script entry point: fetch the source CSV, parse it, save the dataset,
// record the run in the changelog, then print a summary. Any failure is
// reported and the process exits with status 1.
try {
  const rawCSV = await fetchLatestData();
  const data = parseCSV(rawCSV);
  if (data.length === 0) {
    console.error('❌ No valid data found in source CSV');
    process.exit(1);
  }

  // processAndSaveData() sorts `data` in place (oldest to newest), so it must
  // run BEFORE the last element is read. Otherwise "latest" would just be
  // whichever row happened to be last in the source file.
  processAndSaveData(data);
  const latestRecord = data[data.length - 1];

  updateChangelog(latestRecord, data.length);

  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
  console.log('✅ COVID DATA UPDATE COMPLETE');
  console.log('━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');
  console.log(`📅 Latest data point: ${formatDate(convertToISO(latestRecord.week_ending_date))}`);
  console.log(`📊 Latest viral load: ${parseFloat(latestRecord.sars_cov2_log10_copies_ml).toFixed(2)} log10 copies/mL`);
  console.log(`📈 Total records: ${data.length}\n`);
} catch (error) {
  console.error('❌ Error updating COVID data:', error);
  process.exit(1);
}