# (file-viewer residue, not part of the script: 122 lines, 4.5 KiB, PowerShell)
# fix-2025.ps1 - Post-process the scraped 2025 restaurant JSON
#
# Reports: wrong prices, zero-course restaurants, partially filled courses.
# Fixes:   HTML entities in names/descs, wrong prices (re-fetched per restaurant).
# The corrected data is written back over the same JSON file.

# The data file is expected to sit in the same directory as this script.
$projectDir = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent
$jsonPath = Join-Path -Path $projectDir -ChildPath '2025-restaurants.json'

# Read the file as a single UTF-8 string and parse it into objects.
$data = Get-Content -Path $jsonPath -Raw -Encoding UTF8 | ConvertFrom-Json

# Decode-Html - turn HTML-escaped text into plain, single-line text.
#
# Parameters:
#   $str - the text to clean; may be $null or empty.
# Returns:
#   $str unchanged when it is null/empty/falsy; otherwise the decoded string
#   with every run of whitespace collapsed to one space and the ends trimmed.
#
# Decoding is done in a single pass by the framework decoder, so named and
# numeric entities (&amp;, &quot;, &#039;, &lt;, &gt;, &nbsp;, ...) are all
# handled and already-escaped text such as '&amp;lt;' is not unescaped twice.
function Decode-Html($str) {
    if (-not $str) { return $str }

    $decoded = [System.Net.WebUtility]::HtmlDecode([string]$str)

    # \s also matches the non-breaking space produced by &nbsp;, so it is
    # normalised to a regular space together with tabs and newlines.
    ($decoded -replace '\s+', ' ').Trim()
}

# ---- Report issues ----
# Read-only pass: prints the problems found in $data before anything is changed.
Write-Host "=== Data Quality Report ==="
Write-Host "Total restaurants: $($data.Count)"
Write-Host ""

# Any price that is not one of the three tiers is listed.
Write-Host "Wrong prices (not 25/35/45):"
$data |
    Where-Object { $_.price -notin @(25,35,45) } |
    ForEach-Object { Write-Host " $($_.slug): price=$($_.price)" }

Write-Host ""
Write-Host "Zero-course restaurants (all 3 empty):"
$data |
    Where-Object {
        $courses = $_.menu.courses
        ($courses.'First Course'.Count -eq 0) -and
        ($courses.'Second Course'.Count -eq 0) -and
        ($courses.'Third Course'.Count -eq 0)
    } |
    ForEach-Object { Write-Host " $($_.slug) [price=$($_.price)] name=$($_.name)" }

Write-Host ""
Write-Host "Partial courses (any course != 3):"
$data | ForEach-Object {
    $c1 = $_.menu.courses.'First Course'.Count
    $c2 = $_.menu.courses.'Second Course'.Count
    $c3 = $_.menu.courses.'Third Course'.Count

    $allFull = ($c1 -eq 3) -and ($c2 -eq 3) -and ($c3 -eq 3)
    # Totally empty menus were already listed in the zero-course section.
    $allEmpty = ($c1 -eq 0) -and ($c2 -eq 0) -and ($c3 -eq 0)

    if ((-not $allFull) -and (-not $allEmpty)) {
        Write-Host " $($_.slug): $c1/$c2/$c3"
    }
}

Write-Host ""
Write-Host "=== Applying Fixes ==="

# ---- Fix HTML entities in all string fields ----
# Rewrites, in place, each restaurant's name and cuisine plus the name and
# desc of every item in the three courses, using Decode-Html.
$courseNames = @('First Course', 'Second Course', 'Third Course')
foreach ($restaurant in $data) {
    $restaurant.name = Decode-Html $restaurant.name
    $restaurant.cuisine = Decode-Html $restaurant.cuisine

    foreach ($courseName in $courseNames) {
        $dishes = $restaurant.menu.courses.$courseName
        # Nothing to clean when the course is missing or empty.
        if (-not $dishes) { continue }

        foreach ($dish in $dishes) {
            $dish.name = Decode-Html $dish.name
            $dish.desc = Decode-Html $dish.desc
        }
    }
}

# ---- Fix wrong prices using the Wayback Machine ----
# Valid prices are the three tiers: $25, $35, $45.
# For every restaurant whose price is not one of those, re-fetch its archived
# project page and extract the price again with a tighter regex. A restaurant
# keeps its current price when the fetch fails or no tier price is found.

# @(...) guarantees an array, so .Count is reliable even when exactly one
# restaurant matches (a lone object has no dependable Count in Windows PowerShell).
$wrongPrice = @($data | Where-Object { $_.price -notin @(25,35,45) })
if ($wrongPrice.Count -gt 0) {
    Write-Host "Re-fetching $($wrongPrice.Count) restaurants with wrong prices..."

    foreach ($r in $wrongPrice) {
        Write-Host " $($r.slug)..." -NoNewline
        try {
            $url = "https://web.archive.org/web/20250306132630/https://inlanderrestaurantweek.com/project/$($r.slug)/"
            $resp = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 60 -ErrorAction Stop
            $html = $resp.Content

            # Look specifically for an h1 containing a price at one of the tiers.
            # (?:(?!</h1>).)*? keeps the match inside that h1 element; a plain
            # .*? with Singleline would run on to any later <strong> price.
            $priceM = [regex]::Match($html, '<h1[^>]*>(?:(?!</h1>).)*?<strong>\$(25|35|45)</strong>', [System.Text.RegularExpressions.RegexOptions]::Singleline)
            if ($priceM.Success) {
                $r.price = [int]$priceM.Groups[1].Value
                Write-Host " fixed to $($r.price)"
            } else {
                # Fall back to every <strong>$N</strong> on the page and take
                # the first one that is 25, 35, or 45.
                $allPrices = [regex]::Matches($html, '<strong>\$(\d+)</strong>')
                $validPrice = $allPrices | Where-Object { $_.Groups[1].Value -in @('25','35','45') } | Select-Object -First 1
                if ($validPrice) {
                    $r.price = [int]$validPrice.Groups[1].Value
                    Write-Host " fixed to $($r.price)"
                } else {
                    Write-Host " could not determine - left at $($r.price)"
                }
            }
        } catch {
            # Best effort: report the failure and continue with the next restaurant.
            Write-Host " FETCH ERROR: $_"
        }
        # Short pause between requests to the archive.
        Start-Sleep -Milliseconds 300
    }
}

# ---- Save fixed JSON ----
# Serialise the corrected data and overwrite the input file.
$json = $data | ConvertTo-Json -Depth 10

# Write UTF-8 without a byte order mark: [System.Text.Encoding]::UTF8 emits a
# BOM, which JSON producers must not add and which strict JSON parsers reject.
$utf8NoBom = New-Object System.Text.UTF8Encoding $false
[System.IO.File]::WriteAllText($jsonPath, $json, $utf8NoBom)

Write-Host ""
Write-Host "Saved fixed JSON to $jsonPath"