Set up 2025 files and started parsing the archive site, but was rate limited. Will need to finish it in the future.
This commit is contained in:
123
fix-tavolata.ps1
Normal file
123
fix-tavolata.ps1
Normal file
@@ -0,0 +1,123 @@
|
||||
# fix-tavolata.ps1
|
||||
# Run this after the Wayback Machine rate limit resets (wait ~30 minutes after last run)
|
||||
# Recovers tavolata's Third Course using the same-block parser strategy
|
||||
|
||||
# Resolve the data file relative to this script's own location and load it.
$projectDir = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent
$jsonPath = Join-Path -Path $projectDir -ChildPath '2025-restaurants.json'
# -Raw reads the whole file as one string so ConvertFrom-Json sees a single document.
$data = Get-Content -Path $jsonPath -Raw -Encoding UTF8 | ConvertFrom-Json
|
||||
|
||||
# Decode-Html: converts HTML-escaped text into plain text with normalized whitespace.
#   $str    - text that may contain HTML entities (&amp;, &#39;, &quot;, &lt;, &gt;, &nbsp;, ...).
#   returns - the decoded text with every whitespace run collapsed to a single space
#             and both ends trimmed; a null or empty input is returned unchanged.
function Decode-Html($str) {
    if (-not $str) { return $str }
    # HtmlDecode resolves named and numeric entities in a single pass, so an escaped
    # ampersand such as "&amp;lt;" becomes the literal text "&lt;" rather than "<".
    $decoded = [System.Net.WebUtility]::HtmlDecode($str)
    # &nbsp; decodes to U+00A0, which the .NET \s class matches, so it is collapsed
    # together with ordinary spaces, tabs and newlines.
    return ($decoded -replace '\s+', ' ').Trim()
}
|
||||
# Get-CleanText: reduces an HTML fragment to plain text.
#   $rawHtml - HTML fragment.
#   returns  - whatever Decode-Html produces once every tag has been replaced by a space.
function Get-CleanText($rawHtml) {
    # Tags become spaces (not nothing) so words on either side of a tag stay separated.
    $withoutTags = $rawHtml -replace '<[^>]+>', ' '
    return Decode-Html $withoutTags
}
|
||||
# Test-DietaryTag: reports whether a string is, in its entirety, one of the short
# marker tokens listed in the pattern below (GF, GFA, V, V+, DF, DFA, V:, 2025, Drink, V+A).
#   $str    - candidate text.
#   returns - result of the -match test (case-insensitive, whole-string anchored).
function Test-DietaryTag($str) {
    $tagPattern = '^(GF|GFA|V\+?|DF|DFA|V:|2025|Drink|V\+A)$'
    return ($str -match $tagPattern)
}
|
||||
|
||||
# Get-Dish: extracts one dish (name + description) from the inner HTML of a paragraph.
#   $pContent - inner HTML of a <p> element.
#   returns   - a PSCustomObject with 'name' and 'desc' properties, or $null when no
#               acceptable dish name is found.
# Three markup shapes are tried in order; the first one that yields an acceptable name wins:
#   1. <b>Name<br> description            (name = text between <b> and the first <br>)
#   2. <b>Name</b> [<strong>More</strong>] description
#   3. <strong>Name</strong> description
function Get-Dish($pContent) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # --- Shape 1: bold text running up to a line break ---------------------------
    $boldToBreak = [regex]::Match($pContent, '(?s)<b>(.*?)<br\s*/?>', $singleLine)
    if ($boldToBreak.Success) {
        $title = Get-CleanText $boldToBreak.Groups[1].Value
        # Accept 3-80 characters that are neither a dietary tag nor a short "XX:" label.
        $titleOk = $title.Length -ge 3 -and $title.Length -le 80 -and -not (Test-DietaryTag $title) -and $title -notmatch '^[A-Z]{1,3}:'
        if ($titleOk) {
            # Everything after the matched <br> is the description.
            $restStart = $boldToBreak.Index + $boldToBreak.Length
            return [PSCustomObject]@{
                name = $title
                desc = Get-CleanText ($pContent.Substring($restStart))
            }
        }
    }

    # --- Shape 2: a closed <b>...</b>, optionally continued by a <strong> --------
    $boldPair = [regex]::Match($pContent, '(?s)<b>(.*?)</b>', $singleLine)
    if ($boldPair.Success) {
        $lead = Get-CleanText $boldPair.Groups[1].Value
        if ($lead.Length -ge 3 -and -not (Test-DietaryTag $lead)) {
            $tail = $pContent.Substring($boldPair.Index + $boldPair.Length)
            # A <strong> that follows with no other tag in between is treated as the
            # second half of the name, unless it is only a dietary tag or too short.
            $strongTail = [regex]::Match($tail, '(?s)^[^<]*<strong>(.*?)</strong>(.*)', $singleLine)
            if ($strongTail.Success) {
                $suffix = Get-CleanText $strongTail.Groups[1].Value
                if (-not (Test-DietaryTag $suffix) -and $suffix.Length -ge 2) {
                    return [PSCustomObject]@{
                        name = "$lead $suffix".Trim()
                        desc = Get-CleanText $strongTail.Groups[2].Value
                    }
                }
            }
            return [PSCustomObject]@{
                name = $lead
                desc = Get-CleanText $tail
            }
        }
    }

    # --- Shape 3: name wrapped in <strong> ---------------------------------------
    $strongPair = [regex]::Match($pContent, '(?s)<strong>(.*?)</strong>', $singleLine)
    if (-not $strongPair.Success) {
        return $null
    }

    $title = Get-CleanText $strongPair.Groups[1].Value
    $titleOk = $title.Length -ge 3 -and $title.Length -le 80 -and -not (Test-DietaryTag $title) -and $title -notmatch '^[A-Z]{1,3}:'
    if (-not $titleOk) {
        return $null
    }

    # Description: text after the first <br> anywhere in the paragraph if there is
    # one, otherwise text after the first </strong>.
    $descHtml = ''
    if ($pContent -match '(?s)<br\s*/?>(.*?)$') {
        $descHtml = $matches[1]
    } else {
        $afterStrong = [regex]::Match($pContent, '(?s)</strong>(.*?)$', $singleLine)
        if ($afterStrong.Success) {
            $descHtml = $afterStrong.Groups[1].Value
        }
    }

    return [PSCustomObject]@{
        name = $title
        desc = Get-CleanText $descHtml
    }
}
|
||||
|
||||
# Get-Dishes: collects every dish found in the <p> elements of a course's HTML.
#   $courseHtml - HTML fragment for one course.
#   returns     - an ArrayList of dish objects as produced by Get-Dish, emitted as a
#                 single object (so an empty result is still a list with Count 0).
function Get-Dishes($courseHtml) {
    $found = New-Object System.Collections.ArrayList
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    $paragraphs = [regex]::Matches($courseHtml, '(?s)<p[^>]*>(.*?)</p>', $singleLine)
    foreach ($paragraph in $paragraphs) {
        $inner = $paragraph.Groups[1].Value
        # Only paragraphs containing bold/strong markup can carry a dish name.
        if ($inner -match '<b>|<strong>') {
            $dish = Get-Dish $inner
            if ($dish -and $dish.name) {
                [void]$found.Add($dish)
            }
        }
    }

    # The leading comma stops PowerShell from unrolling the list on output.
    return ,$found
}
|
||||
|
||||
# Get-CourseBlock: cuts the HTML belonging to one course heading out of a page.
#   $html      - full page HTML.
#   $label     - heading text of the wanted course (e.g. 'Third Course').
#   $nextLabel - heading text of the following course, or $null/empty when there is none.
#   returns    - the HTML fragment for the course, or '' when nothing could be located.
function Get-CourseBlock($html, $label, $nextLabel) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # Preferred strategy: everything between this label and the next label.
    if ($nextLabel) {
        $betweenPattern = [regex]::Escape($label) + '(.+?)(?=' + [regex]::Escape($nextLabel) + ')'
        $between = [regex]::Match($html, $betweenPattern, $singleLine)
        if ($between.Success) {
            return $between.Groups[1].Value
        }
    }

    $labelAt = $html.IndexOf($label)
    if ($labelAt -lt 0) {
        return ''
    }

    # Fallbacks only look at a window of at most 8000 characters starting at the label.
    $windowLength = [Math]::Min(8000, $html.Length - $labelAt)
    $window = $html.Substring($labelAt, $windowLength)

    # Fallback 1: paragraphs that follow the closing heading tag, up to the next </div>.
    $sameBlock = [regex]::Match($window, '(?s)</h[123]>\s*(<p.+?)(?=</div>)', $singleLine)
    if ($sameBlock.Success -and $sameBlock.Groups[1].Value -match '<p') {
        return $sameBlock.Groups[1].Value
    }

    # Fallback 2: the next et_pb_text_inner container that does not open with a heading.
    $nextInner = [regex]::Match($window, '(?s)et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)', $singleLine)
    if ($nextInner.Success) {
        return $nextInner.Groups[1].Value
    }

    return ''
}
|
||||
|
||||
# Main: re-fetch tavolata's menu from archived snapshots and, if the Third Course
# can be parsed, write the updated data back to the JSON file.
$r = $data | Where-Object { $_.slug -eq 'tavolata' }
if (-not $r) {
    # Nothing to update; stop before spending any requests against the rate limit.
    Write-Host "No entry with slug 'tavolata' found in $jsonPath." -ForegroundColor Red
    return
}
Write-Host "tavolata currently: $($r.menu.courses.'First Course'.Count)/$($r.menu.courses.'Second Course'.Count)/$($r.menu.courses.'Third Course'.Count)"

# Snapshot timestamps to try, in order, as they appear in the archive URL below.
$timestamps = @('20250306132630','20250401000000','20250415000000','20250501000000')
$success = $false

for ($i = 0; $i -lt $timestamps.Count; $i++) {
    $ts = $timestamps[$i]
    Write-Host "Trying timestamp $ts..." -NoNewline
    try {
        $url = "https://web.archive.org/web/$ts/https://inlanderrestaurantweek.com/project/tavolata/"
        $resp = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 60 -ErrorAction Stop
        $html = $resp.Content
        # A rate-limit page can come back as a normal body; treat it as a failure.
        if ($html -match '429 Too Many') { throw "Rate limited" }

        $first  = Get-Dishes (Get-CourseBlock $html 'First Course' 'Second Course')
        $second = Get-Dishes (Get-CourseBlock $html 'Second Course' 'Third Course')
        $third  = Get-Dishes (Get-CourseBlock $html 'Third Course' $null)

        Write-Host " -> $($first.Count)/$($second.Count)/$($third.Count)"

        if ($third.Count -gt 0) {
            # First/Second are only overwritten when this snapshot actually yielded
            # dishes for them; the Third Course is the one being recovered.
            if ($first.Count -gt 0)  { $r.menu.courses.'First Course'  = @($first) }
            if ($second.Count -gt 0) { $r.menu.courses.'Second Course' = @($second) }
            $r.menu.courses.'Third Course' = @($third)
            Write-Host "SUCCESS! tavolata Third Course recovered." -ForegroundColor Green
            $success = $true
        } else {
            Write-Host "  Third Course still empty, trying next timestamp..."
        }
    } catch {
        Write-Host " ERROR: $_" -ForegroundColor Red
    }

    if ($success) { break }

    # Pause between requests only; there is nothing to wait for after the last attempt.
    if ($i -lt $timestamps.Count - 1) {
        Start-Sleep -Seconds 10
    }
}

if (-not $success) {
    Write-Host "Could not recover tavolata Third Course. Try again later." -ForegroundColor Yellow
} else {
    # -Depth 10 keeps the nested menu/courses/dish objects from being truncated.
    $json = $data | ConvertTo-Json -Depth 10
    [System.IO.File]::WriteAllText($jsonPath, $json, [System.Text.Encoding]::UTF8)
    Write-Host "Saved to $jsonPath"
}
|
||||
Reference in New Issue
Block a user