Files
Inlander-Restaurant-Week-Pi…/fix-tavolata.ps1

124 lines
5.8 KiB
PowerShell

# fix-tavolata.ps1
# Run this after the Wayback Machine rate limit resets (wait ~30 minutes after last run)
# Recovers tavolata's Third Course using the same-block parser strategy
# Resolve the folder this script lives in, then load the restaurant data file
# that sits next to it. $jsonPath and $data are reused further down the script.
$projectDir = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent
$jsonPath = Join-Path -Path $projectDir -ChildPath '2025-restaurants.json'
$data = Get-Content -Path $jsonPath -Raw -Encoding UTF8 |
    ConvertFrom-Json
# Decodes the handful of HTML entities handled here (&#039; &quot; &lt; &gt;
# &nbsp; &amp;), collapses every run of whitespace to a single space and trims
# the result.
#   $str    - text that may contain HTML entities.
#   returns - the decoded, whitespace-normalised string; a null/empty input is
#             returned unchanged.
function Decode-Html($str) {
    if (-not $str) { return $str }

    # '&amp;' must be decoded LAST. Decoding it first turns an escaped entity
    # such as '&amp;lt;' into '&lt;', which a later replacement would then
    # decode a second time into '<'.
    $decoded = $str `
        -replace '&#039;', "'" `
        -replace '&quot;', '"' `
        -replace '&lt;', '<' `
        -replace '&gt;', '>' `
        -replace '&nbsp;', ' ' `
        -replace '&amp;', '&'

    ($decoded -replace '\s+', ' ').Trim()
}
# Turns an HTML fragment into plain text: every tag is replaced by a single
# space, and the result is passed to Decode-Html.
function Get-CleanText($rawHtml) {
    $withoutTags = $rawHtml -replace '<[^>]+>', ' '
    Decode-Html $withoutTags
}
# Reports whether $str consists solely of one of the listed marker tokens
# (GF, GFA, V, V+, DF, DFA, V:, 2025, Drink, V+A). The comparison uses -match
# and is therefore case-insensitive.
function Test-DietaryTag($str) {
    $tagPattern = '^(GF|GFA|V\+?|DF|DFA|V:|2025|Drink|V\+A)$'
    return ($str -match $tagPattern)
}
# Extracts a single dish from the inner HTML of one <p> element.
# Three markup layouts are tried in order:
#   1. "<b>Name<br>description"                - name is the text between <b> and the first <br>.
#   2. "<b>Name</b> [<strong>more</strong>] description"
#   3. "<strong>Name</strong> ... description"
# A layout whose candidate name fails validation falls through to the next
# one, except layout 3, which gives up immediately.
#   $pContent - inner HTML of a paragraph.
#   returns   - a PSCustomObject with 'name' and 'desc', or $null when no
#               acceptable dish name is found.
function Get-Dish($pContent) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # Layout 1: bold text that runs up to a line break.
    $boldToBreak = [regex]::Match($pContent, '(?s)<b>(.*?)<br\s*/?>', $singleLine)
    if ($boldToBreak.Success) {
        $title = Get-CleanText $boldToBreak.Groups[1].Value
        $usable = ($title.Length -ge 3) -and ($title.Length -le 80) -and
            (-not (Test-DietaryTag $title)) -and ($title -notmatch '^[A-Z]{1,3}:')
        if ($usable) {
            # Everything after the <br> is the description.
            $remainder = $pContent.Substring($boldToBreak.Index + $boldToBreak.Length)
            return [PSCustomObject]@{ name = $title; desc = Get-CleanText $remainder }
        }
    }

    # Layout 2: a closed <b>...</b>, optionally continued by a <strong>...</strong>.
    $boldPair = [regex]::Match($pContent, '(?s)<b>(.*?)</b>', $singleLine)
    if ($boldPair.Success) {
        $lead = Get-CleanText $boldPair.Groups[1].Value
        if ($lead.Length -ge 3 -and -not (Test-DietaryTag $lead)) {
            $tail = $pContent.Substring($boldPair.Index + $boldPair.Length)
            $continuation = [regex]::Match($tail, '(?s)^[^<]*<strong>(.*?)</strong>(.*)', $singleLine)
            if ($continuation.Success) {
                $extra = Get-CleanText $continuation.Groups[1].Value
                if (-not (Test-DietaryTag $extra) -and $extra.Length -ge 2) {
                    # The <strong> text is treated as the second half of the name.
                    return [PSCustomObject]@{
                        name = "$lead $extra".Trim()
                        desc = Get-CleanText $continuation.Groups[2].Value
                    }
                }
            }
            return [PSCustomObject]@{ name = $lead; desc = Get-CleanText $tail }
        }
    }

    # Layout 3: name wrapped in <strong>.
    $strongPair = [regex]::Match($pContent, '(?s)<strong>(.*?)</strong>', $singleLine)
    if (-not $strongPair.Success) { return $null }

    $title = Get-CleanText $strongPair.Groups[1].Value
    $rejected = ($title.Length -lt 3) -or ($title.Length -gt 80) -or
        (Test-DietaryTag $title) -or ($title -match '^[A-Z]{1,3}:')
    if ($rejected) { return $null }

    # Description: text after the first <br> if there is one, otherwise the
    # text after the first </strong>.
    $descHtml = ''
    if ($pContent -match '(?s)<br\s*/?>(.*?)$') {
        $descHtml = $matches[1]
    } else {
        $afterStrong = [regex]::Match($pContent, '(?s)</strong>(.*?)$', $singleLine)
        if ($afterStrong.Success) { $descHtml = $afterStrong.Groups[1].Value }
    }
    return [PSCustomObject]@{ name = $title; desc = Get-CleanText $descHtml }
}
# Collects every dish found in a course's HTML.
# Each <p>...</p> that contains a <b> or <strong> tag is handed to Get-Dish;
# results that are non-null and have a name are kept.
#   $courseHtml - HTML fragment covering one course.
#   returns     - an ArrayList of dish objects (wrapped with the unary comma
#                 so it is returned as a single collection object).
function Get-Dishes($courseHtml) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline
    $found = [System.Collections.ArrayList]@()

    $paragraphs = [regex]::Matches($courseHtml, '(?s)<p[^>]*>(.*?)</p>', $singleLine)
    foreach ($paragraph in $paragraphs) {
        $inner = $paragraph.Groups[1].Value
        # Only paragraphs with bold/strong markup are candidates for a dish.
        if ($inner -match '<b>|<strong>') {
            $dish = Get-Dish $inner
            if ($dish -and $dish.name) { [void]$found.Add($dish) }
        }
    }

    return ,$found
}
# Returns the HTML fragment that belongs to one course heading.
#   $html      - full page HTML.
#   $label     - heading text of the wanted course (e.g. 'Third Course').
#   $nextLabel - heading text of the following course, or $null for the last one.
# Strategy:
#   1. If $nextLabel is given, return everything between $label and $nextLabel.
#   2. Otherwise (or if that fails) look at up to 8000 characters starting at
#      the first occurrence of $label and return
#      a. the <p>... content that follows a closing </h1>-</h3> up to the next </div>, or
#      b. the content of an 'et_pb_text_inner' container that does not start
#         with a heading.
#   returns - the fragment, or '' when nothing matches.
function Get-CourseBlock($html, $label, $nextLabel) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    if ($nextLabel) {
        $spanPattern = [regex]::Escape($label) + '(.+?)(?=' + [regex]::Escape($nextLabel) + ')'
        $span = [regex]::Match($html, $spanPattern, $singleLine)
        if ($span.Success) { return $span.Groups[1].Value }
    }

    $start = $html.IndexOf($label)
    if ($start -lt 0) { return '' }

    # Limit the fallback search to a window after the label.
    $windowLength = [Math]::Min(8000, $html.Length - $start)
    $window = $html.Substring($start, $windowLength)

    # Fallback a: paragraphs in the same <div> as the heading.
    $afterHeading = [regex]::Match($window, '(?s)</h[123]>\s*(<p.+?)(?=</div>)', $singleLine)
    if ($afterHeading.Success -and $afterHeading.Groups[1].Value -match '<p') {
        return $afterHeading.Groups[1].Value
    }

    # Fallback b: the next text container that is not itself a heading.
    $innerText = [regex]::Match($window, '(?s)et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)', $singleLine)
    if ($innerText.Success) { return $innerText.Groups[1].Value }

    return ''
}
# --- Main: re-fetch the tavolata page from the Wayback Machine and patch the JSON ---

# Locate the record to repair. Without it, every attempt below would download
# the page and then fail on the property assignment, so stop early instead.
$r = $data | Where-Object { $_.slug -eq 'tavolata' }
if (-not $r) {
    Write-Host "No 'tavolata' entry found in '$jsonPath'; nothing to fix." -ForegroundColor Yellow
    return
}
Write-Host "tavolata currently: $($r.menu.courses.'First Course'.Count)/$($r.menu.courses.'Second Course'.Count)/$($r.menu.courses.'Third Course'.Count)"

# Wayback Machine snapshot timestamps to try, in order.
$timestamps = @('20250306132630','20250401000000','20250415000000','20250501000000')
$success = $false
$attempt = 0
foreach ($ts in $timestamps) {
    # Pause only BETWEEN requests: no delay before the first attempt, after a
    # successful recovery, or after the last timestamp has been tried.
    if ($attempt -gt 0) { Start-Sleep -Seconds 10 }
    $attempt++

    Write-Host "Trying timestamp $ts..." -NoNewline
    try {
        $url = "https://web.archive.org/web/$ts/https://inlanderrestaurantweek.com/project/tavolata/"
        $resp = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 60 -ErrorAction Stop
        $html = $resp.Content
        # A rate-limit page can come back as a normal response body.
        if ($html -match '429 Too Many') { throw "Rate limited" }

        $first = Get-Dishes (Get-CourseBlock $html 'First Course' 'Second Course')
        $second = Get-Dishes (Get-CourseBlock $html 'Second Course' 'Third Course')
        $third = Get-Dishes (Get-CourseBlock $html 'Third Course' $null)
        Write-Host " -> $($first.Count)/$($second.Count)/$($third.Count)"

        if ($third.Count -gt 0) {
            # First/Second are only overwritten when the new parse found
            # something; Third is the course being recovered.
            if ($first.Count -gt 0) { $r.menu.courses.'First Course' = @($first) }
            if ($second.Count -gt 0) { $r.menu.courses.'Second Course' = @($second) }
            $r.menu.courses.'Third Course' = @($third)
            Write-Host "SUCCESS! tavolata Third Course recovered." -ForegroundColor Green
            $success = $true
        } else {
            Write-Host " Third Course still empty, trying next timestamp..."
        }
    } catch {
        Write-Host " ERROR: $_" -ForegroundColor Red
    }

    if ($success) { break }
}

if (-not $success) {
    Write-Host "Could not recover tavolata Third Course. Try again later." -ForegroundColor Yellow
} else {
    # Persist the whole data set (not just the patched record) back to disk.
    $json = $data | ConvertTo-Json -Depth 10
    [System.IO.File]::WriteAllText($jsonPath, $json, [System.Text.Encoding]::UTF8)
    Write-Host "Saved to $jsonPath"
}