# rescrape2-missing.ps1 - Re-fetches problematic restaurants with multiple timestamps
# Uses fixed timestamps (no CDX API) with generous delays to avoid rate limiting
# Resolve the directory containing this script, then load the restaurant
# data file that sits next to it as parsed JSON.
$projectDir = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent
$jsonPath = Join-Path -Path $projectDir -ChildPath '2025-restaurants.json'
$data = ConvertFrom-Json -InputObject (Get-Content -Path $jsonPath -Raw -Encoding UTF8)
# Decode-Html - Decodes HTML character references and normalizes whitespace.
#   $str : text that may contain entities such as &amp;, &lt;, &quot;, &#39;, &nbsp;
# Returns the decoded text with every run of whitespace (including
# non-breaking spaces) collapsed to one space and the ends trimmed.
# Null/empty input is returned unchanged.
function Decode-Html($str) {
    if (-not $str) { return $str }

    # Decode in a single pass so already-escaped text such as '&amp;lt;'
    # becomes '&lt;' rather than being decoded twice into '<'.
    $decoded = [System.Net.WebUtility]::HtmlDecode($str)

    # &nbsp; decodes to U+00A0; fold it into ordinary spaces together with
    # tabs/newlines before trimming.
    ($decoded -replace '[\s\u00A0]+', ' ').Trim()
}
# Get-CleanText - Turns an HTML fragment into plain text.
#   $rawHtml : markup to clean
# Every tag-like span (<...>) is replaced by a single space and the result
# is then handed to Decode-Html, whose output is returned.
function Get-CleanText($rawHtml) {
    $withoutTags = $rawHtml -replace '<[^>]+>', ' '
    return (Decode-Html $withoutTags)
}
# Invoke-Dishes - Extracts dish entries from the HTML of one course section.
#   $courseHtml : HTML text of a single course block
# Returns an ArrayList of objects with 'name' and 'desc' properties, one per
# accepted match.
# NOTE(review): several regex literals in this function look as if their HTML
# tag text was stripped (an empty pattern, a bare '(.*?)', literals broken
# across lines). Confirm the intended patterns against version control before
# relying on this function.
function Invoke-Dishes($courseHtml) {
$dishes = [System.Collections.ArrayList]@()
# Singleline lets '.' match newlines so a match can span several lines.
$opts = [System.Text.RegularExpressions.RegexOptions]::Singleline
# NOTE(review): the pattern literal below spans three lines and begins with a
# line break; the leftover ']*>(.*?)' suggests it was presumably a paragraph
# pattern like '<p[^>]*>(.*?)</p>' - TODO confirm.
$pMatches = [regex]::Matches($courseHtml, '
]*>(.*?)
', $opts)
foreach ($pm in $pMatches) {
# Group 1 is the inner content of the matched element.
$pContent = $pm.Groups[1].Value
# NOTE(review): an empty pattern matches every string, so this guard never
# skips anything as written; presumably it tested for a name tag - confirm.
if ($pContent -notmatch '') { continue }
# NOTE(review): as written this lazy group captures an empty string at
# position 0, so the length filter below rejects every entry; presumably the
# group was wrapped in the name tag's open/close markup - confirm.
$nameM = [regex]::Match($pContent, '(.*?)', $opts)
if (-not $nameM.Success) { continue }
$name = Get-CleanText $nameM.Groups[1].Value
# Skip entries whose name starts with GF, GFA, V, V+, DF, V:, 2025 or Drink
# (-match is case-insensitive).
if ($name -match '^(GF|GFA|V\+?|DF|V:|2025|Drink)') { continue }
# Skip names shorter than 3 or longer than 80 characters.
if ($name.Length -lt 3 -or $name.Length -gt 80) { continue }
# Skip names that begin with one to three letters followed by a colon.
if ($name -match '^[A-Z]{1,3}:') { continue }
$afterBr = ''
# Description: group 1 of the first pattern if it matches, otherwise group 1
# of the fallback pattern in the else branch.
# NOTE(review): the first literal contains a raw line break where a tag
# (judging by the variable name, a line-break tag) presumably stood, and the
# fallback pattern looks similarly truncated - confirm.
if ($pContent -match '(?s)
(.*?)$') { $afterBr = $matches[1] }
else {
$am = [regex]::Match($pContent, '(?s)(.*?)$', $opts)
if ($am.Success) { $afterBr = $am.Groups[1].Value }
}
$desc = Get-CleanText $afterBr
# ArrayList.Add returns the new index; assigning to $null keeps it out of the
# function's output.
$null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
}
# The leading comma wraps the list so it is returned as one object instead of
# being enumerated element by element.
return ,$dishes
}
function Invoke-CourseBlock($html, $courseLabel, $nextLabel) {
$opts = [System.Text.RegularExpressions.RegexOptions]::Singleline
if ($nextLabel) {
$m = [regex]::Match($html, ([regex]::Escape($courseLabel) + '(.+?)(?=' + [regex]::Escape($nextLabel) + ')'), $opts)
if ($m.Success) { return $m.Groups[1].Value }
}
$idx = $html.IndexOf($courseLabel)
if ($idx -ge 0) {
$sub = $html.Substring($idx, [Math]::Min(6000, $html.Length - $idx))
$im = [regex]::Match($sub, '(?s)et_pb_text_inner">(?!\s*\s*\s*\s* Updating: $($bestResult.first.Count)/$($bestResult.second.Count)/$($bestResult.third.Count)"
$r.menu.courses.'First Course' = @($bestResult.first)
$r.menu.courses.'Second Course' = @($bestResult.second)
$r.menu.courses.'Third Course' = @($bestResult.third)
if ($bestResult.hours -and -not $r.menu.hours) { $r.menu.hours = $bestResult.hours }
if ($bestResult.phone -and -not $r.menu.phone) { $r.menu.phone = $bestResult.phone }
} else {
Write-Host " -> No improvement"
}
Start-Sleep -Milliseconds 2000
}
# Final report: list every restaurant that still lacks a full menu, then
# write the (possibly updated) data set back to the JSON file.
Write-Host ""
Write-Host "=== Final Status ==="

# A restaurant counts as complete only when each of the three courses has
# exactly 3 dishes; anything else (including all-empty) is still incomplete.
# The @() wrapper guarantees an array, so .Count is correct even when zero or
# one restaurant matches (a lone PSCustomObject does not expose a reliable
# .Count in Windows PowerShell).
$remaining = @($data | Where-Object {
    $c1 = $_.menu.courses.'First Course'.Count
    $c2 = $_.menu.courses.'Second Course'.Count
    $c3 = $_.menu.courses.'Third Course'.Count
    $c1 -ne 3 -or $c2 -ne 3 -or $c3 -ne 3
})
Write-Host "Still incomplete: $($remaining.Count)"

# Print the per-course dish counts for each incomplete restaurant.
foreach ($r in $remaining) {
    $c1 = $r.menu.courses.'First Course'.Count
    $c2 = $r.menu.courses.'Second Course'.Count
    $c3 = $r.menu.courses.'Third Course'.Count
    Write-Host " $($r.slug): $c1/$c2/$c3"
}

# Persist the whole data set; -Depth 10 keeps nested menu objects from being
# flattened to strings by the default serialization depth.
$json = $data | ConvertTo-Json -Depth 10
[System.IO.File]::WriteAllText($jsonPath, $json, [System.Text.Encoding]::UTF8)
Write-Host ""
Write-Host "Saved to $jsonPath"