Created the file for 2025 and started parsing the content from the archive website, but was rate-limited. Will need to finish this in the future.

This commit is contained in:
2026-02-24 15:24:56 -08:00
parent e7252c4a74
commit 17c8270742
7 changed files with 7149 additions and 0 deletions

166
rescrape2-missing.ps1 Normal file
View File

@@ -0,0 +1,166 @@
# rescrape2-missing.ps1 - Re-fetches problematic restaurants with multiple timestamps
# Uses fixed timestamps (no CDX API) with generous delays to avoid rate limiting
# Locate the folder that contains this script; the data file is expected to sit next to it.
$projectDir = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent
$jsonPath = Join-Path -Path $projectDir -ChildPath '2025-restaurants.json'
# -Raw reads the file as one string so ConvertFrom-Json receives a single JSON document.
$data = Get-Content -Path $jsonPath -Raw -Encoding UTF8 | ConvertFrom-Json
# Decodes HTML character references in $str and normalizes whitespace.
#
# Parameters: $str - text that may contain HTML entities (named or numeric).
# Returns:    $str unchanged when it is null/empty; otherwise the decoded text with every
#             run of whitespace collapsed to a single space and the ends trimmed.
function Decode-Html($str) {
    if (-not $str) { return $str }
    # HtmlDecode makes exactly one pass, so '&amp;lt;' becomes the literal text '&lt;'
    # (a chained -replace that handles '&amp;' first would decode it twice, to '<').
    # It also covers numeric references such as '&#038;' and '&#8217;'.
    $decoded = [System.Net.WebUtility]::HtmlDecode($str)
    # \s also matches the no-break space produced by '&nbsp;'.
    ($decoded -replace '\s+', ' ').Trim()
}
# Converts an HTML fragment to plain text: each tag is replaced by a single space and the
# remainder is handed to Decode-Html.
function Get-CleanText($rawHtml) {
    $withoutTags = $rawHtml -replace '<[^>]+>', ' '
    return Decode-Html $withoutTags
}
# Extracts the dishes listed in the HTML of one course.
#
# Every <p>...</p> that contains a <strong>...</strong> element is a dish candidate: the
# <strong> text is the name; the description is the text after the first <br>, or after
# </strong> when the paragraph has no <br>. Candidates are skipped when the name
#   - is a dietary code (GF, GFA, V, V+, DF) not followed by another letter,
#   - starts with "2025" or "Drink",
#   - is shorter than 3 or longer than 80 characters,
#   - starts with 1-3 upper-case letters and a colon.
#
# Parameters: $courseHtml - HTML fragment for a single course.
# Returns:    an ArrayList of objects with 'name' and 'desc' properties (may be empty).
function Invoke-Dishes($courseHtml) {
    $dishes = [System.Collections.ArrayList]@()
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline
    $pMatches = [regex]::Matches($courseHtml, '<p[^>]*>(.*?)</p>', $opts)
    foreach ($pm in $pMatches) {
        $pContent = $pm.Groups[1].Value
        $nameM = [regex]::Match($pContent, '<strong>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }
        $name = Get-CleanText $nameM.Groups[1].Value
        # -match is case-insensitive, so without the "(?!\p{L})" guard a bare 'V' or 'GF'
        # prefix would also discard real dishes such as "Vanilla Cheesecake".
        if ($name -match '^(GFA?|V\+?|DF)(?!\p{L})') { continue }
        if ($name -match '^(2025|Drink)') { continue }
        if ($name.Length -lt 3 -or $name.Length -gt 80) { continue }
        # Case-sensitive on purpose: only upper-case codes like "VG:" are treated as labels.
        if ($name -cmatch '^[A-Z]{1,3}:') { continue }
        $afterBr = ''
        if ($pContent -match '(?s)<br\s*/?>(.*?)$') {
            $afterBr = $matches[1]
        }
        else {
            $am = [regex]::Match($pContent, '</strong>(.*?)$', $opts)
            if ($am.Success) { $afterBr = $am.Groups[1].Value }
        }
        $desc = Get-CleanText $afterBr
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }
    # The unary comma keeps PowerShell from unrolling the list, so callers always get an
    # ArrayList (with a usable .Count) even when it holds zero or one dish.
    return ,$dishes
}
# Returns the slice of page HTML that belongs to one course.
#
# Parameters:
#   $html        - full page HTML.
#   $courseLabel - heading text that starts the course (e.g. 'First Course').
#   $nextLabel   - heading text of the following course, or $null/empty for the last one.
# Returns: the HTML between $courseLabel and $nextLabel when both are found. Otherwise the
#          function looks at up to 6000 characters starting at $courseLabel and returns the
#          content of the first 'et_pb_text_inner' container that does not open with an
#          h1-h3 heading. Returns '' when nothing matches or $html is null/empty.
function Invoke-CourseBlock($html, $courseLabel, $nextLabel) {
    if ([string]::IsNullOrEmpty($html)) { return '' }
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline
    if ($nextLabel) {
        $pattern = [regex]::Escape($courseLabel) + '(.+?)(?=' + [regex]::Escape($nextLabel) + ')'
        $m = [regex]::Match($html, $pattern, $opts)
        if ($m.Success) { return $m.Groups[1].Value }
    }
    # Ordinal search: matches the regex branch above (which is not culture-aware) and
    # avoids culture-dependent results on markup.
    $idx = $html.IndexOf($courseLabel, [System.StringComparison]::Ordinal)
    if ($idx -lt 0) { return '' }
    $sub = $html.Substring($idx, [Math]::Min(6000, $html.Length - $idx))
    # Capture stops at the next container that opens with a heading, or at a run of four
    # closing </div> tags followed by a new <div.
    $im = [regex]::Match($sub, 'et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)', $opts)
    if ($im.Success) { return $im.Groups[1].Value }
    return ''
}
# Downloads one page and parses the three course menus plus hours and phone.
#
# Parameters: $url - page to download.
# Returns:    a hashtable with keys
#               first/second/third - dish lists produced by Invoke-Dishes,
#               hours              - "Menu served ..." text, or '' when absent,
#               phone              - first "(208) ddd-dddd" / "(509) ddd-dddd" match, or '',
#               total              - number of dishes across the three courses.
# Throws:     when the request fails (-ErrorAction Stop) or when the body contains
#             '429 Too Many Requests'.
function Fetch-And-Parse($url) {
    $resp = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 45 -ErrorAction Stop
    $html = $resp.Content
    # A rate-limit notice can arrive as ordinary page content; treat it as a failure.
    if ($html -match '429 Too Many Requests') { throw "Rate limited" }

    $first = Invoke-Dishes (Invoke-CourseBlock $html 'First Course' 'Second Course')
    $second = Invoke-Dishes (Invoke-CourseBlock $html 'Second Course' 'Third Course')
    $third = Invoke-Dishes (Invoke-CourseBlock $html 'Third Course' $null)

    $hours = ''
    if ($html -match 'Menu served ([^<]+)') { $hours = "Menu served $($matches[1].Trim())" }
    $phone = ''
    if ($html -match '\((?:208|509)\) \d{3}-\d{4}') { $phone = $matches[0] }

    return @{
        first  = $first
        second = $second
        third  = $third
        hours  = $hours
        phone  = $phone
        # Previously a constant 0; now reflects what was actually parsed.
        total  = $first.Count + $second.Count + $third.Count
    }
}
# Snapshot timestamps (yyyyMMddHHmmss) requested for every restaurant, in this order:
# 1, 8 and 15 March, 1 and 15 April, and 1 May 2025.
$timestamps = @(
    '20250301000000'
    '20250308000000'
    '20250315000000'
    '20250401000000'
    '20250415000000'
    '20250501000000'
)
# Find problematic restaurants: any restaurant whose three courses are not exactly
# three dishes each (this includes restaurants with no dishes at all).
# @(...) guarantees an array, so .Count is reliable even when exactly one restaurant
# matches (a lone PSCustomObject has no .Count in Windows PowerShell 5.1).
$problems = @($data | Where-Object {
    $c1 = $_.menu.courses.'First Course'.Count
    $c2 = $_.menu.courses.'Second Course'.Count
    $c3 = $_.menu.courses.'Third Course'.Count
    $c1 -ne 3 -or $c2 -ne 3 -or $c3 -ne 3
})
# Retry every problematic restaurant against each fixed snapshot timestamp and keep the
# snapshot that yields the most dishes in total.
# Measure-Object gives a correct count whether $problems is empty, a single object or an array.
$problemCount = ($problems | Measure-Object).Count
Write-Host "Restaurants to retry: $problemCount"
Write-Host "Starting with 3-second delay between requests..."
Write-Host ""
$i = 0
foreach ($r in $problems) {
    $i++
    $slug = $r.slug
    $curr1 = $r.menu.courses.'First Course'.Count
    $curr2 = $r.menu.courses.'Second Course'.Count
    $curr3 = $r.menu.courses.'Third Course'.Count
    Write-Host "[$i/$problemCount] $slug (currently $curr1/$curr2/$curr3)"
    $bestResult = $null
    $bestTotal = $curr1 + $curr2 + $curr3
    foreach ($ts in $timestamps) {
        $url = "https://web.archive.org/web/$ts/https://inlanderrestaurantweek.com/project/$slug/"
        Write-Host " Trying $ts..." -NoNewline
        try {
            $result = Fetch-And-Parse $url
            $t = $result.first.Count + $result.second.Count + $result.third.Count
            Write-Host " $($result.first.Count)/$($result.second.Count)/$($result.third.Count)"
            if ($t -gt $bestTotal) {
                $bestTotal = $t
                $bestResult = $result
                if ($result.first.Count -ge 3 -and $result.second.Count -ge 3 -and $result.third.Count -ge 3) {
                    break # Perfect - no need to try more timestamps
                }
            }
        } catch {
            # A failed fetch (including a rate-limit response) only skips this timestamp.
            Write-Host " FAIL: $_"
        }
        Start-Sleep -Milliseconds 3000
    }
    if ($bestResult -and $bestTotal -gt ($curr1 + $curr2 + $curr3)) {
        $new1 = $bestResult.first.Count
        $new2 = $bestResult.second.Count
        $new3 = $bestResult.third.Count
        # The snapshot won on total dishes, but an individual course may still be shorter
        # than what is already stored; never replace a course with fewer dishes.
        $final1 = [Math]::Max($curr1, $new1)
        $final2 = [Math]::Max($curr2, $new2)
        $final3 = [Math]::Max($curr3, $new3)
        Write-Host " -> Updating: $final1/$final2/$final3"
        if ($new1 -ge $curr1) { $r.menu.courses.'First Course' = @($bestResult.first) }
        if ($new2 -ge $curr2) { $r.menu.courses.'Second Course' = @($bestResult.second) }
        if ($new3 -ge $curr3) { $r.menu.courses.'Third Course' = @($bestResult.third) }
        # Hours and phone are only filled in when the record does not have them yet.
        if ($bestResult.hours -and -not $r.menu.hours) { $r.menu.hours = $bestResult.hours }
        if ($bestResult.phone -and -not $r.menu.phone) { $r.menu.phone = $bestResult.phone }
    } else {
        Write-Host " -> No improvement"
    }
    Start-Sleep -Milliseconds 2000
}
# Report which restaurants are still incomplete, then write the data back to disk.
Write-Host ""
Write-Host "=== Final Status ==="
# Same rule as the initial selection: a restaurant is incomplete unless every course has
# exactly three dishes. @(...) keeps .Count reliable when only one restaurant remains.
$remaining = @($data | Where-Object {
    $c1 = $_.menu.courses.'First Course'.Count
    $c2 = $_.menu.courses.'Second Course'.Count
    $c3 = $_.menu.courses.'Third Course'.Count
    $c1 -ne 3 -or $c2 -ne 3 -or $c3 -ne 3
})
Write-Host "Still incomplete: $($remaining.Count)"
foreach ($r in $remaining) {
    # Counts come from the loop variable $r; $_ is not the current item in a foreach statement.
    $c1 = $r.menu.courses.'First Course'.Count
    $c2 = $r.menu.courses.'Second Course'.Count
    $c3 = $r.menu.courses.'Third Course'.Count
    Write-Host " $($r.slug): $c1/$c2/$c3"
}
# -Depth 10 so the nested menu/course/dish objects are serialized in full.
$json = $data | ConvertTo-Json -Depth 10
[System.IO.File]::WriteAllText($jsonPath, $json, [System.Text.Encoding]::UTF8)
Write-Host ""
Write-Host "Saved to $jsonPath"