# fix-2025.ps1 - Post-process the scraped 2025 restaurant JSON
#
# Reports: restaurants whose price is not one of the known tiers, restaurants
#          with no menu items at all, and restaurants with partial courses.
# Fixes:   HTML entities in names/descs, and wrong prices (by re-fetching the
#          restaurant page from the Wayback Machine).
# Output:  overwrites 2025-restaurants.json (next to this script) in place.

$projectDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
$jsonPath   = Join-Path $projectDir '2025-restaurants.json'

# Always treat the parsed document as an array. In Windows PowerShell 5.1 a
# lone [pscustomobject] has no usable .Count, so a one-entry file would
# otherwise report "Total restaurants:" as empty.
$parsed = Get-Content $jsonPath -Raw -Encoding UTF8 | ConvertFrom-Json
if ($null -eq $parsed) {
    $data = @()
} else {
    $data = @($parsed)
}

# The only price tiers considered valid.
$validPrices = @(25, 35, 45)

# Decode HTML entities in $str, collapse whitespace runs to a single space and
# trim the result. Empty/null input is returned unchanged.
function Decode-Html($str) {
    if (-not $str) { return $str }

    # '&amp;' is decoded first (as the original replace chain did), so that
    # double-encoded text such as '&amp;#39;' is fully decoded as well.
    $s = ([string]$str) -replace '&amp;', '&'

    # Handles &#39; / &quot; / &lt; / &gt; / &nbsp; plus any other named or
    # numeric entity the scraper left behind.
    $s = [System.Net.WebUtility]::HtmlDecode($s)

    # .NET's \s matches U+00A0 too, so a decoded &nbsp; collapses to ' '.
    $s = $s -replace '\s+', ' '
    $s.Trim()
}

# ---- Report issues ----
Write-Host "=== Data Quality Report ==="
Write-Host "Total restaurants: $($data.Count)"
Write-Host ""

Write-Host "Wrong prices (not 25/35/45):"
$data |
    Where-Object { $_.price -notin $validPrices } |
    ForEach-Object { Write-Host " $($_.slug): price=$($_.price)" }
Write-Host ""

Write-Host "Zero-course restaurants (all 3 empty):"
$data |
    Where-Object {
        $_.menu.courses.'First Course'.Count -eq 0 -and
        $_.menu.courses.'Second Course'.Count -eq 0 -and
        $_.menu.courses.'Third Course'.Count -eq 0
    } |
    ForEach-Object { Write-Host " $($_.slug) [price=$($_.price)] name=$($_.name)" }
Write-Host ""

Write-Host "Partial courses (any course != 3):"
$data |
    Where-Object {
        $_.menu.courses.'First Course'.Count -ne 3 -or
        $_.menu.courses.'Second Course'.Count -ne 3 -or
        $_.menu.courses.'Third Course'.Count -ne 3
    } |
    Where-Object {
        # Exclude totally empty ones (already reported above)
        -not (
            $_.menu.courses.'First Course'.Count -eq 0 -and
            $_.menu.courses.'Second Course'.Count -eq 0 -and
            $_.menu.courses.'Third Course'.Count -eq 0
        )
    } |
    ForEach-Object {
        $c1 = $_.menu.courses.'First Course'.Count
        $c2 = $_.menu.courses.'Second Course'.Count
        $c3 = $_.menu.courses.'Third Course'.Count
        Write-Host " $($_.slug): $c1/$c2/$c3"
    }
Write-Host ""

Write-Host "=== Applying Fixes ==="

# ---- Fix HTML entities in all string fields ----
foreach ($r in $data) {
    $r.name    = Decode-Html $r.name
    $r.cuisine = Decode-Html $r.cuisine

    foreach ($course in @('First Course', 'Second Course', 'Third Course')) {
        $items = $r.menu.courses.$course
        if ($items) {
            foreach ($item in $items) {
                $item.name = Decode-Html $item.name
                $item.desc = Decode-Html $item.desc
            }
        }
    }
}

# ---- Fix wrong prices using the Wayback Machine ----
# Re-fetch the archived page of every restaurant whose price is not a known
# tier and try to read the tier from the page with a tighter regex.
# @(...) keeps .Count reliable when exactly one restaurant matches.
$wrongPrice = @($data | Where-Object { $_.price -notin $validPrices })

if ($wrongPrice.Count -gt 0) {
    Write-Host "Re-fetching $($wrongPrice.Count) restaurants with wrong prices..."

    foreach ($r in $wrongPrice) {
        Write-Host " $($r.slug)..." -NoNewline
        try {
            $url  = "https://web.archive.org/web/20250306132630/https://inlanderrestaurantweek.com/project/$($r.slug)/"
            $resp = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 60 -ErrorAction Stop
            $html = $resp.Content

            # First tier price that follows an <h1 ...> tag. Singleline lets
            # '.' cross newlines; (?!\d) stops "$250" being read as "$25".
            $priceM = [regex]::Match(
                $html,
                '<h1[^>]*>.*?\$(25|35|45)(?!\d)',
                [System.Text.RegularExpressions.RegexOptions]::Singleline)

            if ($priceM.Success) {
                $r.price = [int]$priceM.Groups[1].Value
                Write-Host " fixed to $($r.price)"
            } else {
                # Fallback: scan every dollar amount on the page and take the
                # first one that is exactly 25, 35 or 45.
                $allPrices  = [regex]::Matches($html, '\$(\d+)')
                $validPrice = $allPrices |
                    Where-Object { $_.Groups[1].Value -in @('25', '35', '45') } |
                    Select-Object -First 1

                if ($validPrice) {
                    $r.price = [int]$validPrice.Groups[1].Value
                    Write-Host " fixed to $($r.price)"
                } else {
                    Write-Host " could not determine - left at $($r.price)"
                }
            }
        } catch {
            # Best effort: a failed fetch leaves the price untouched.
            Write-Host " FETCH ERROR: $_"
        }

        # Small pause between requests to the archive.
        Start-Sleep -Milliseconds 300
    }
}

# ---- Save fixed JSON ----
$json = $data | ConvertTo-Json -Depth 10
[System.IO.File]::WriteAllText($jsonPath, $json, [System.Text.Encoding]::UTF8)

Write-Host ""
Write-Host "Saved fixed JSON to $jsonPath"