Created the file for 2025 and started parsing the content from the archive website, but was rate limited. Will need to finish in the future.

This commit is contained in:
2026-02-24 15:24:56 -08:00
parent e7252c4a74
commit 17c8270742
7 changed files with 7149 additions and 0 deletions

179
rescrape-missing.ps1 Normal file
View File

@@ -0,0 +1,179 @@
# rescrape-missing.ps1 - Re-fetches 0-course and partial restaurants
# using CDX API to find best available Wayback Machine snapshot
# Locate the data file next to this script and load it as objects.
$projectDir = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent
$jsonPath = Join-Path -Path $projectDir -ChildPath '2025-restaurants.json'
$data = Get-Content -Path $jsonPath -Raw -Encoding UTF8 | ConvertFrom-Json
# Decode-Html - Decodes HTML entities in $str and normalizes whitespace.
#   $str : text that may contain HTML entities (named or numeric).
# Returns the decoded text with every whitespace run collapsed to one space
# and the ends trimmed. Falsy input ($null, '') is returned unchanged.
function Decode-Html($str) {
    if (-not $str) { return $str }
    # Decode in a single pass. Replacing '&amp;' first and the other entities
    # afterwards would double-decode text such as '&amp;lt;' into '<', and a
    # hand-written list misses entities like '&#8217;' or '&eacute;'.
    $decoded = [System.Net.WebUtility]::HtmlDecode([string]$str)
    # '&nbsp;' decodes to U+00A0, which .NET's \s matches, so it is collapsed
    # to a plain space here as well.
    return ($decoded -replace '\s+', ' ').Trim()
}
# Get-CleanText - Turns an HTML fragment into plain text.
#   $rawHtml : HTML fragment.
# Every tag is replaced by a space, then the result goes through Decode-Html
# and is trimmed.
function Get-CleanText($rawHtml) {
    $withoutTags = $rawHtml -replace '<[^>]+>', ' '
    return (Decode-Html $withoutTags).Trim()
}
# Invoke-Dishes - Extracts dishes from the HTML of a single course.
#   $courseHtml : HTML fragment holding the course's <p> elements.
# Each <p> containing a <strong>...</strong> is treated as one dish: the
# <strong> text is the name, and the text after the first <br> (or, when there
# is no <br>, after </strong>) is the description.
# Returns an ArrayList of objects with 'name' and 'desc' properties. The list
# is returned as one object (not unrolled), so .Count works for 0 or 1 dishes.
function Invoke-Dishes($courseHtml) {
    $dishes = [System.Collections.ArrayList]@()
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline
    $pMatches = [regex]::Matches($courseHtml, '<p[^>]*>(.*?)</p>', $opts)
    foreach ($pm in $pMatches) {
        $pContent = $pm.Groups[1].Value
        $nameM = [regex]::Match($pContent, '<strong>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }
        $name = Get-CleanText $nameM.Groups[1].Value
        # Skip dietary-legend labels (GF, GFA, DF, V, V+, V:) and "2025..." /
        # "Drink..." headings. The dietary codes must be followed by a
        # non-letter (or end of text); without that boundary a bare 'V' or 'GF'
        # prefix also discards real dishes such as "Vanilla Cheesecake".
        if ($name -match '^(?:(?:GFA?|DF|V\+?)(?![A-Za-z])|2025|Drink)') { continue }
        if ($name.Length -lt 3 -or $name.Length -gt 80) { continue }
        # Skip short "XX:" style labels.
        if ($name -match '^[A-Z]{1,3}:') { continue }
        $afterBr = ''
        if ($pContent -match '(?s)<br\s*/?>(.*?)$') { $afterBr = $matches[1] }
        else {
            $afterStrong = [regex]::Match($pContent, '(?s)</strong>(.*?)$', $opts)
            if ($afterStrong.Success) { $afterBr = $afterStrong.Groups[1].Value }
        }
        $desc = Get-CleanText $afterBr
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }
    # The unary comma stops PowerShell from unrolling the list on output.
    return ,$dishes
}
# Invoke-CourseBlock - Returns the HTML that belongs to one course.
#   $html        : full page HTML.
#   $courseLabel : heading text of the wanted course.
#   $nextLabel   : heading text of the following course, or $null for the last.
# With $nextLabel, the text between the two labels is returned. Otherwise (or
# when that match fails) the first non-heading 'et_pb_text_inner' block within
# 6000 characters after $courseLabel is returned. Returns '' when nothing fits.
function Invoke-CourseBlock($html, $courseLabel, $nextLabel) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    if ($nextLabel) {
        $betweenLabels = -join @([regex]::Escape($courseLabel), '(.+?)(?=', [regex]::Escape($nextLabel), ')')
        $between = [regex]::Match($html, $betweenLabels, $singleLine)
        if ($between.Success) { return $between.Groups[1].Value }
    }

    $labelPos = $html.IndexOf($courseLabel)
    if ($labelPos -lt 0) { return '' }

    # Only look at a bounded window after the label.
    $windowLen = [Math]::Min(6000, $html.Length - $labelPos)
    $window = $html.Substring($labelPos, $windowLen)
    $blockPattern = '(?s)et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)'
    $block = [regex]::Match($window, $blockPattern, $singleLine)
    if ($block.Success) { return $block.Groups[1].Value }

    return ''
}
# Parse-RestaurantHtml - Parses one restaurant page.
#   $html : full page HTML.
# Returns a hashtable with keys:
#   first / second / third : dish lists from Invoke-Dishes for each course
#   hours : "Menu served ..." text, or '' when absent
#   phone : first "(208) ddd-dddd" or "(509) ddd-dddd" number, or '' when absent
function Parse-RestaurantHtml($html) {
    $firstBlock = Invoke-CourseBlock $html 'First Course' 'Second Course'
    $secondBlock = Invoke-CourseBlock $html 'Second Course' 'Third Course'
    $thirdBlock = Invoke-CourseBlock $html 'Third Course' $null

    $result = @{}
    $result['first'] = Invoke-Dishes $firstBlock
    $result['second'] = Invoke-Dishes $secondBlock
    $result['third'] = Invoke-Dishes $thirdBlock

    $result['hours'] = ''
    if ($html -match 'Menu served ([^<]+)') {
        $result['hours'] = "Menu served $($matches[1].Trim())"
    }

    $result['phone'] = ''
    if ($html -match '\((?:208|509)\) \d{3}-\d{4}') {
        $result['phone'] = $matches[0]
    }

    return $result
}
# Find which restaurants need re-scraping: any entry whose three courses do
# not each hold exactly 3 dishes (this includes entries with no dishes at all).
# @(...) forces an array so .Count is reliable when zero or one entry matches;
# on Windows PowerShell 5.1 a single PSCustomObject has no usable .Count.
$needsRescrape = @($data | Where-Object {
    $c1 = $_.menu.courses.'First Course'.Count
    $c2 = $_.menu.courses.'Second Course'.Count
    $c3 = $_.menu.courses.'Third Course'.Count
    $c1 -ne 3 -or $c2 -ne 3 -or $c3 -ne 3
})
Write-Host "Restaurants to re-scrape: $($needsRescrape.Count)"
Write-Host ""
# For every restaurant that needs it: list Wayback Machine snapshots through
# the CDX API, parse each snapshot, keep the best result and merge it into the
# in-memory record when it improves on what is stored.
foreach ($r in $needsRescrape) {
    $slug = $r.slug
    Write-Host "[$slug] Looking up CDX snapshots..." -NoNewline
    try {
        # CDX API: find snapshots from Jan-May 2025
        $cdxUrl = "https://web.archive.org/cdx/search/cdx?url=inlanderrestaurantweek.com/project/$slug/&output=text&limit=10&from=20250101&to=20250501&filter=statuscode:200&fl=timestamp"
        $cdxResp = Invoke-WebRequest -Uri $cdxUrl -UseBasicParsing -TimeoutSec 30 -ErrorAction Stop
        # Trim() tolerates CRLF line endings; @(...) keeps .Count reliable.
        $timestamps = @($cdxResp.Content -split "`n" |
            ForEach-Object { $_.Trim() } |
            Where-Object { $_ -match '^\d{14}$' })

        # if/else instead of 'continue' so the throttling sleep at the bottom
        # of the loop also runs for restaurants without snapshots.
        if ($timestamps.Count -eq 0) {
            Write-Host " No CDX snapshots found"
        } else {
            Write-Host " Found $($timestamps.Count) snapshots"

            # What is stored now; constant while the snapshots are tried.
            $curr1 = $r.menu.courses.'First Course'.Count
            $curr2 = $r.menu.courses.'Second Course'.Count
            $curr3 = $r.menu.courses.'Third Course'.Count
            $currTotal = $curr1 + $curr2 + $curr3

            $best = $null
            # Score to beat. It starts at the stored total and is raised with
            # every accepted snapshot, so a later, poorer snapshot cannot
            # replace a better one found earlier.
            $bestTotal = $currTotal
            foreach ($ts in $timestamps) {
                Write-Host " Trying $ts..." -NoNewline
                try {
                    $pageUrl = "https://web.archive.org/web/$ts/https://inlanderrestaurantweek.com/project/$slug/"
                    $resp = Invoke-WebRequest -Uri $pageUrl -UseBasicParsing -TimeoutSec 45 -ErrorAction Stop
                    $html = $resp.Content
                    $parsed = Parse-RestaurantHtml $html
                    $c1 = $parsed.first.Count
                    $c2 = $parsed.second.Count
                    $c3 = $parsed.third.Count
                    Write-Host " $c1/$c2/$c3"

                    $newTotal = $c1 + $c2 + $c3
                    $isComplete = ($c1 -ge 3 -and $c2 -ge 3 -and $c3 -ge 3)
                    if ($newTotal -gt $bestTotal -or $isComplete) {
                        $best = $parsed
                        $bestTotal = $newTotal
                        # A complete menu is good enough; stop fetching.
                        if ($isComplete) { break }
                    }
                } catch {
                    # Report why the fetch failed (e.g. rate limiting).
                    Write-Host " FETCH ERROR: $($_.Exception.Message)"
                }
                Start-Sleep -Milliseconds 400
            }

            # Apply the candidate only when at least one course gained dishes.
            $improved = $best -and (
                $best.first.Count -gt $curr1 -or
                $best.second.Count -gt $curr2 -or
                $best.third.Count -gt $curr3)
            if ($improved) {
                Write-Host " -> Updating with $($best.first.Count)/$($best.second.Count)/$($best.third.Count) courses"
                $r.menu.courses.'First Course' = @($best.first)
                $r.menu.courses.'Second Course' = @($best.second)
                $r.menu.courses.'Third Course' = @($best.third)
                # Fill in hours/phone only when the record has none.
                if ($best.hours -and -not $r.menu.hours) { $r.menu.hours = $best.hours }
                if ($best.phone -and -not $r.menu.phone) { $r.menu.phone = $best.phone }
            } else {
                Write-Host " -> No improvement found"
            }
        }
    } catch {
        Write-Host " CDX ERROR: $_"
    }
    # Throttle between restaurants.
    Start-Sleep -Milliseconds 500
}
# ---- Final report ----
# List every restaurant whose courses still do not hold exactly 3 dishes each,
# then write the (possibly updated) data back to the JSON file.
Write-Host ""
Write-Host "=== Final Status ==="
$data | ForEach-Object {
    $courseSet = $_.menu.courses
    $c1 = $courseSet.'First Course'.Count
    $c2 = $courseSet.'Second Course'.Count
    $c3 = $courseSet.'Third Course'.Count
    # "All three empty" is already covered by "any course is not 3".
    if ($c1 -ne 3 -or $c2 -ne 3 -or $c3 -ne 3) {
        Write-Host " $($_.slug): $c1/$c2/$c3"
    }
}
[System.IO.File]::WriteAllText($jsonPath, ($data | ConvertTo-Json -Depth 10), [System.Text.Encoding]::UTF8)
Write-Host ""
Write-Host "Saved to $jsonPath"