Created the file for 2025 and started parsing the content from the archive website, but was rate limited. Will need to finish in the future.

This commit is contained in:
2026-02-24 15:24:56 -08:00
parent e7252c4a74
commit 17c8270742
7 changed files with 7149 additions and 0 deletions

121
fix-2025.ps1 Normal file
View File

@@ -0,0 +1,121 @@
# fix-2025.ps1 - Post-process the scraped 2025 restaurant JSON
# Fixes: HTML entities in names/descs and wrong prices (re-fetched from the Wayback Machine).
# Zero-course and partial-course restaurants are only reported, not re-fetched.
# Resolve paths relative to this script's own location so it can be run from
# any working directory.
$projectDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
$jsonPath = Join-Path $projectDir '2025-restaurants.json'

# Load the scraped data. Fail fast when the file is missing or cannot be
# parsed: otherwise $data stays $null, every later step silently does nothing,
# and the save step at the end of the script overwrites the JSON file.
$data = $null
$data = Get-Content $jsonPath -Raw -Encoding UTF8 -ErrorAction Stop | ConvertFrom-Json
if ($null -eq $data) {
    throw "No restaurant data could be loaded from $jsonPath"
}
# Decode-Html - Decode HTML character references and normalise whitespace.
#   $str : text that may contain references such as &amp;, &#39;, &quot;,
#          &lt;, &gt; or &nbsp;.
#   Returns the decoded text with every run of whitespace collapsed to a single
#   space and leading/trailing whitespace removed. Empty or $null input is
#   returned unchanged.
function Decode-Html($str) {
    if (-not $str) { return $str }

    # Use the framework decoder rather than a hand-written -replace chain: it
    # handles named and numeric references alike, and it decodes in a single
    # pass, so already-escaped text such as '&amp;lt;' becomes '&lt;' instead
    # of being decoded twice into '<'.
    $s = [System.Net.WebUtility]::HtmlDecode([string]$str)

    # &nbsp; decodes to U+00A0, which the regex class \s also matches, so
    # non-breaking spaces are folded into ordinary spaces here as well.
    $s = $s -replace '\s+', ' '
    $s.Trim()
}
# ---- Report issues ----
# Read-only pass over $data: prints three lists to the host and changes nothing.
Write-Host "=== Data Quality Report ==="
Write-Host "Total restaurants: $($data.Count)"
Write-Host ""

# 1) Restaurants whose price is not one of the three expected tiers.
Write-Host "Wrong prices (not 25/35/45):"
$priceTiers = @(25,35,45)
$data | ForEach-Object {
    if (-not ($_.price -in $priceTiers)) {
        Write-Host " $($_.slug): price=$($_.price)"
    }
}
Write-Host ""

# 2) Restaurants with no items in any of the three courses.
Write-Host "Zero-course restaurants (all 3 empty):"
$data | ForEach-Object {
    $menuCourses = $_.menu.courses
    $firstCount  = $menuCourses.'First Course'.Count
    $secondCount = $menuCourses.'Second Course'.Count
    $thirdCount  = $menuCourses.'Third Course'.Count
    if (($firstCount -eq 0) -and ($secondCount -eq 0) -and ($thirdCount -eq 0)) {
        Write-Host " $($_.slug) [price=$($_.price)] name=$($_.name)"
    }
}
Write-Host ""

# 3) Restaurants where some course does not have exactly 3 items, skipping the
#    completely empty ones that were already listed in section 2.
Write-Host "Partial courses (any course != 3):"
$data | ForEach-Object {
    $menuCourses = $_.menu.courses
    $firstCount  = $menuCourses.'First Course'.Count
    $secondCount = $menuCourses.'Second Course'.Count
    $thirdCount  = $menuCourses.'Third Course'.Count

    $allThree = ($firstCount -eq 3) -and ($secondCount -eq 3) -and ($thirdCount -eq 3)
    $allEmpty = ($firstCount -eq 0) -and ($secondCount -eq 0) -and ($thirdCount -eq 0)
    if ((-not $allThree) -and (-not $allEmpty)) {
        Write-Host " $($_.slug): $firstCount/$secondCount/$thirdCount"
    }
}
Write-Host ""
Write-Host "=== Applying Fixes ==="

# ---- Decode HTML entities in restaurant and menu-item text ----
# For every restaurant: clean its name and cuisine, then the name and desc of
# each item listed under its three courses. Courses with no items are skipped.
$courseTitles = @('First Course', 'Second Course', 'Third Course')
foreach ($restaurant in $data) {
    $restaurant.name = Decode-Html $restaurant.name
    $restaurant.cuisine = Decode-Html $restaurant.cuisine

    foreach ($title in $courseTitles) {
        $dishes = $restaurant.menu.courses.$title
        if (-not $dishes) { continue }

        foreach ($dish in $dishes) {
            $dish.name = Decode-Html $dish.name
            $dish.desc = Decode-Html $dish.desc
        }
    }
}
# ---- Fix wrong prices using the Wayback Machine price page ----
# The price page listed restaurants under $25, $35, $45 sections.
# We'll re-fetch pages for wrong-price restaurants using a tighter regex.
#
# The filter result is wrapped in @(...) so that .Count is reliable: in Windows
# PowerShell 5.1 a single PSCustomObject has no Count property, so without the
# wrapper a lone wrong-price restaurant would make the test below evaluate
# '$null -gt 0' and be skipped. The '$null -ne $_' clause keeps a $null
# pipeline item (when $data itself is $null) from being counted as a restaurant.
$wrongPrice = @($data | Where-Object { $null -ne $_ -and $_.price -notin @(25,35,45) })
if ($wrongPrice.Count -gt 0) {
    Write-Host "Re-fetching $($wrongPrice.Count) restaurants with wrong prices..."
    foreach ($r in $wrongPrice) {
        Write-Host " $($r.slug)..." -NoNewline
        try {
            # Archived snapshot of the restaurant's own page.
            $url = "https://web.archive.org/web/20250306132630/https://inlanderrestaurantweek.com/project/$($r.slug)/"
            $resp = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 60 -ErrorAction Stop
            $html = $resp.Content
            # Look specifically for h1 containing a 2-digit price at a tier
            # NOTE(review): with Singleline, '.*?' may run past the closing
            # </h1>, so this can also match a <strong> price that appears later
            # in the page - confirm against real pages before tightening.
            $priceM = [regex]::Match($html, '<h1[^>]*>.*?<strong>\$(25|35|45)</strong>', [System.Text.RegularExpressions.RegexOptions]::Singleline)
            if ($priceM.Success) {
                $r.price = [int]$priceM.Groups[1].Value
                Write-Host " fixed to $($r.price)"
            } else {
                # Try all strong dollar values and pick first that's 25, 35, or 45
                $allPrices = [regex]::Matches($html, '<strong>\$(\d+)</strong>')
                $validPrice = $allPrices | Where-Object { $_.Groups[1].Value -in @('25','35','45') } | Select-Object -First 1
                if ($validPrice) {
                    $r.price = [int]$validPrice.Groups[1].Value
                    Write-Host " fixed to $($r.price)"
                } else {
                    Write-Host " could not determine - left at $($r.price)"
                }
            }
        } catch {
            # Best effort: report the failure and carry on with the next
            # restaurant; the price is left as it was.
            Write-Host " FETCH ERROR: $_"
        }
        # Pause between requests to go easy on the archive server.
        Start-Sleep -Milliseconds 300
    }
}
# ---- Save fixed JSON ----
# Never overwrite the source file when there is nothing to write: with $data
# equal to $null the serialised output would contain no restaurants and the
# scraped data on disk would be lost.
if ($null -eq $data) {
    throw "No restaurant data is loaded; refusing to overwrite $jsonPath"
}
# -Depth 10 so the nested menu/courses/items structure is serialised in full
# rather than being cut off at ConvertTo-Json's shallow default depth.
$json = $data | ConvertTo-Json -Depth 10
# NOTE: [System.Text.Encoding]::UTF8 writes a UTF-8 byte-order mark at the
# start of the file.
[System.IO.File]::WriteAllText($jsonPath, $json, [System.Text.Encoding]::UTF8)
Write-Host ""
Write-Host "Saved fixed JSON to $jsonPath"