167 lines
6.6 KiB
PowerShell
167 lines
6.6 KiB
PowerShell
# rescrape2-missing.ps1 - Re-fetches problematic restaurants with multiple timestamps
|
|
# Uses fixed timestamps (no CDX API) with generous delays to avoid rate limiting
|
|
|
|
# Resolve the data file that sits next to this script.
$projectDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
$jsonPath = Join-Path $projectDir '2025-restaurants.json'

# Load the restaurant list. The script finishes by writing $data back over
# $jsonPath, so a failed read or parse must stop the run here instead of
# letting an empty $data replace the file.
$data = Get-Content -LiteralPath $jsonPath -Raw -Encoding UTF8 -ErrorAction Stop | ConvertFrom-Json
if ($null -eq $data) {
    throw "No restaurant data could be loaded from $jsonPath"
}
|
|
|
|
# Decodes HTML character references in $str and normalises its whitespace.
#
# Parameters:
#   $str - text that may contain entities such as &amp;, &#39; or &nbsp;.
# Returns:
#   $str unchanged when it is null/empty; otherwise the decoded text with
#   every run of whitespace collapsed to one space and both ends trimmed.
function Decode-Html($str) {
    if (-not $str) { return $str }

    # The framework decoder handles named and numeric references in a single
    # pass, so text such as "&amp;lt;" is decoded once ("&lt;") rather than twice.
    $decoded = [System.Net.WebUtility]::HtmlDecode([string]$str)

    # \s also matches the no-break space produced by &nbsp;.
    return ($decoded -replace '\s+', ' ').Trim()
}
|
|
|
|
# Reduces an HTML fragment to plain text: each tag is swapped for a single
# space and the remainder is passed through Decode-Html.
function Get-CleanText($rawHtml) {
    $withoutTags = $rawHtml -replace '<[^>]+>', ' '
    return Decode-Html $withoutTags
}
|
|
|
|
# Extracts dishes from the HTML of one course section.
#
# Every <p>...</p> whose content has a <strong>...</strong> run is treated as
# a dish candidate: the bold text is the name, and the text after the first
# <br> (or, when there is no <br>, after </strong>) is the description.
#
# Parameters:
#   $courseHtml - HTML fragment for a single course.
# Returns:
#   An ArrayList of objects with 'name' and 'desc' properties (possibly empty).
function Invoke-Dishes($courseHtml) {
    $dishes = [System.Collections.ArrayList]@()
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline

    foreach ($pm in [regex]::Matches($courseHtml, '<p[^>]*>(.*?)</p>', $opts)) {
        $pContent = $pm.Groups[1].Value

        # Paragraphs without a bold run are not dishes.
        $nameM = [regex]::Match($pContent, '<strong>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }

        $name = Get-CleanText $nameM.Groups[1].Value

        # Skip dietary-legend entries (GF, GFA, V, V+, DF) and "2025"/"Drink"
        # headings. -match ignores case, so the abbreviations must not be
        # followed by a letter; otherwise real dishes such as "Vanilla ..." or
        # "Veal ..." would be discarded along with the legend.
        if ($name -match '^(?:(?:GFA?|V\+?|DF)(?![A-Za-z])|2025|Drink)') { continue }
        if ($name.Length -lt 3 -or $name.Length -gt 80) { continue }
        # Short "XX:" style prefixes are labels, not dish names.
        if ($name -match '^[A-Z]{1,3}:') { continue }

        $afterBr = ''
        if ($pContent -match '(?s)<br\s*/?>(.*?)$') {
            $afterBr = $matches[1]
        }
        else {
            $am = [regex]::Match($pContent, '(?s)</strong>(.*?)$', $opts)
            if ($am.Success) { $afterBr = $am.Groups[1].Value }
        }

        $desc = Get-CleanText $afterBr
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }

    # The leading comma stops PowerShell from unrolling the list on output.
    return ,$dishes
}
|
|
|
|
# Returns the slice of $html that belongs to the course named $courseLabel.
#
# When $nextLabel is supplied, the text between the first $courseLabel and the
# following $nextLabel is returned. If that fails (or no $nextLabel is given),
# the function looks in the 6000 characters starting at $courseLabel for the
# first "et_pb_text_inner" container that does not open with an h1-h3 heading
# and returns its content. An empty string means nothing was found.
function Invoke-CourseBlock($html, $courseLabel, $nextLabel) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    if ($nextLabel) {
        $between = [regex]::Escape($courseLabel) + '(.+?)(?=' + [regex]::Escape($nextLabel) + ')'
        $span = [regex]::Match($html, $between, $singleLine)
        if ($span.Success) { return $span.Groups[1].Value }
    }

    $start = $html.IndexOf($courseLabel)
    if ($start -lt 0) { return '' }

    # Limit the fallback search to a window after the label.
    $window = $html.Substring($start, [Math]::Min(6000, $html.Length - $start))
    $inner = [regex]::Match($window, '(?s)et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)', $singleLine)
    if ($inner.Success) { return $inner.Groups[1].Value }

    return ''
}
|
|
|
|
# Downloads one archived restaurant page and parses its three-course menu.
#
# Parameters:
#   $url - page to fetch.
# Returns a hashtable with:
#   first / second / third - dish lists produced by Invoke-Dishes
#   hours - "Menu served ..." text when present in the page, otherwise ''
#   phone - first "(208) ddd-dddd" or "(509) ddd-dddd" number, otherwise ''
#   total - number of dishes found across the three courses
# Throws when the request fails or the body is a "429 Too Many Requests" page.
function Fetch-And-Parse($url) {
    $resp = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 45 -ErrorAction Stop
    $html = $resp.Content

    # A rate-limit notice can arrive as ordinary page content; treat it as a failure.
    if ($html -match '429 Too Many Requests') { throw "Rate limited" }

    $first  = Invoke-Dishes (Invoke-CourseBlock $html 'First Course' 'Second Course')
    $second = Invoke-Dishes (Invoke-CourseBlock $html 'Second Course' 'Third Course')
    $third  = Invoke-Dishes (Invoke-CourseBlock $html 'Third Course' $null)

    $hours = ''
    if ($html -match 'Menu served ([^<]+)') { $hours = "Menu served $($matches[1].Trim())" }

    $phone = ''
    if ($html -match '\((?:208|509)\) \d{3}-\d{4}') { $phone = $matches[0] }

    return @{
        first  = $first
        second = $second
        third  = $third
        hours  = $hours
        phone  = $phone
        # Real dish count rather than a constant placeholder.
        total  = $first.Count + $second.Count + $third.Count
    }
}
|
|
|
|
# Snapshot timestamps (yyyyMMddHHmmss) tried in order for every restaurant,
# running from 1 March 2025 to 1 May 2025.
$timestamps = @(
    '20250301000000', '20250308000000', '20250315000000'   # March
    '20250401000000', '20250415000000'                     # April
    '20250501000000'                                       # May
)
|
|
|
|
# Find problematic restaurants: any entry whose three courses do not each hold
# exactly three dishes (this includes entries with no dishes at all).
# @(...) keeps the result an array even for zero or one match, so .Count is
# always a number (a lone PSCustomObject has no Count on Windows PowerShell).
$problems = @($data | Where-Object {
    $c1 = $_.menu.courses.'First Course'.Count
    $c2 = $_.menu.courses.'Second Course'.Count
    $c3 = $_.menu.courses.'Third Course'.Count
    ($c1 -ne 3) -or ($c2 -ne 3) -or ($c3 -ne 3)
})
|
|
|
|
Write-Host "Restaurants to retry: $($problems.Count)"
Write-Host "Starting with 3-second delay between requests..."
Write-Host ""

# For each flagged restaurant, try the snapshots in order and keep the parse
# that yields more dishes than the entry currently has.
$i = 0
foreach ($r in $problems) {
    $i++
    $slug = $r.slug
    $curr1 = $r.menu.courses.'First Course'.Count
    $curr2 = $r.menu.courses.'Second Course'.Count
    $curr3 = $r.menu.courses.'Third Course'.Count
    $before = $curr1 + $curr2 + $curr3
    Write-Host "[$i/$($problems.Count)] $slug (currently $curr1/$curr2/$curr3)"

    $bestResult = $null
    $bestTotal = $before

    foreach ($ts in $timestamps) {
        $url = "https://web.archive.org/web/$ts/https://inlanderrestaurantweek.com/project/$slug/"
        Write-Host " Trying $ts..." -NoNewline
        try {
            $parsed = Fetch-And-Parse $url
            $n1 = $parsed.first.Count
            $n2 = $parsed.second.Count
            $n3 = $parsed.third.Count
            Write-Host " $n1/$n2/$n3"

            if (($n1 + $n2 + $n3) -gt $bestTotal) {
                $bestTotal = $n1 + $n2 + $n3
                $bestResult = $parsed
                # All three courses filled: no need to try more timestamps.
                $complete = ($n1 -ge 3) -and ($n2 -ge 3) -and ($n3 -ge 3)
                if ($complete) { break }
            }
        }
        catch {
            Write-Host " FAIL: $_"
        }
        # Pause between snapshot requests.
        Start-Sleep -Seconds 3
    }

    $improved = $bestResult -and ($bestTotal -gt $before)
    if (-not $improved) {
        Write-Host " -> No improvement"
    }
    else {
        Write-Host " -> Updating: $($bestResult.first.Count)/$($bestResult.second.Count)/$($bestResult.third.Count)"
        $r.menu.courses.'First Course' = @($bestResult.first)
        $r.menu.courses.'Second Course' = @($bestResult.second)
        $r.menu.courses.'Third Course' = @($bestResult.third)
        # Hours and phone are only filled in when the entry has none yet.
        if ($bestResult.hours -and -not $r.menu.hours) { $r.menu.hours = $bestResult.hours }
        if ($bestResult.phone -and -not $r.menu.phone) { $r.menu.phone = $bestResult.phone }
    }

    # Extra pause before moving to the next restaurant.
    Start-Sleep -Seconds 2
}
|
|
|
|
Write-Host ""
Write-Host "=== Final Status ==="

# Re-run the completeness check after the updates. @(...) keeps .Count numeric
# even when exactly one restaurant is left.
$remaining = @($data | Where-Object {
    $c1 = $_.menu.courses.'First Course'.Count
    $c2 = $_.menu.courses.'Second Course'.Count
    $c3 = $_.menu.courses.'Third Course'.Count
    ($c1 -ne 3) -or ($c2 -ne 3) -or ($c3 -ne 3)
})
Write-Host "Still incomplete: $($remaining.Count)"

# List each remaining restaurant with its per-course dish counts. The counts
# come from the loop variable $r; $_ is not set inside a foreach statement.
foreach ($r in $remaining) {
    $c1 = $r.menu.courses.'First Course'.Count
    $c2 = $r.menu.courses.'Second Course'.Count
    $c3 = $r.menu.courses.'Third Course'.Count
    Write-Host " $($r.slug): $c1/$c2/$c3"
}
|
|
|
|
# Serialise the (possibly updated) restaurant list and write it back over the
# source file using the UTF-8 encoding object.
$payload = $data | ConvertTo-Json -Depth 10
$utf8 = [System.Text.Encoding]::UTF8
[System.IO.File]::WriteAllText($jsonPath, $payload, $utf8)

Write-Host ""
Write-Host "Saved to $jsonPath"
|