Files
Inlander-Restaurant-Week-Pi…/scrape-2025.ps1

228 lines
9.0 KiB
PowerShell

# scrape-2025.ps1 - Scrapes 2025 Inlander Restaurant Week menus from Wayback Machine
# Run from local path (UNC paths block PS execution)
# URL slugs of every participating restaurant. Each one is substituted into the
# ".../project/<slug>/" page URL; the order here is the order of the output records.
$slugs = @(
    '1898', '24taps', '315cuisine', 'ambrosia', 'anthonys', 'arrowhead', 'baba',
    'backyardpublichouse', 'bangkokthai', 'bardenay', 'barkrescuepub', 'beverlys',
    'blackpearl', 'borracho', 'burgerdock', 'cascadia', 'cedars', 'centennial',
    'chaps', 'chinook', 'chowderhead', 'clinkerdagger', 'cochinito', 'collectivekitchen',
    'dassteinhaus', 'deleons', 'deleonstexmex', 'dockside', 'downriver', 'dryfly',
    'durkins', 'east', 'emrys', 'feastworldkitchen', 'flameandcork', 'flatstick',
    'flyinggoat', 'fortheloveofgod', 'francaise', 'ganderryegrass', 'gardenparty',
    'gildedunicorn', 'hang10', 'heritage', 'hogwash', 'honey', 'hulapot',
    'indiahouse', 'indicana', 'inlandpacifickitchen', 'irongoat', 'ironwoodice',
    'karma', 'kasa', 'kismet', 'kunisthai', 'latahbistro', 'lebanon', 'legendsoffire',
    'littledragon', 'littlenoodle', 'longhornbbq', 'loren', 'lumberbeard',
    'macdaddys', 'mackenzieriver', 'mammamias', 'mangotree', 'maryhill',
    'masselowslounge', 'max', 'meltingpot', 'mortys', 'northhill', 'odohertys',
    'osprey', 'outsider', 'palmcourtgrill', 'ponderosa', 'purenorthwest',
    'purgatory', 'qqsushi', 'redtail', 'republickitchen', 'republicpi', 'rut',
    'safariroom', 'saranac', 'satay', 'sauced', 'screamingyak', 'seasons',
    'shawnodonnells', 'shelbys', 'skewers', 'southhillgrill', 'southperrylantern',
    'spencers', 'steamplant', 'steelhead', 'stylus', 'sweetlous', 'swinglounge',
    'table13', 'tavolata', 'terraza', 'thaibamboo', 'thedambar', 'titos',
    'tomatostreet', 'tonysonthelake', 'torratea', 'truelegends', 'twigs',
    'uprise', 'vaqueros', 'vicinopizza', 'victoryburger', 'vieuxcarre',
    'vineolive', 'wileys'
)
# Upper-case area marker searched for in the page => display name stored in the output.
# The map is ordered, so a restaurant's areas are emitted in this sequence.
$areaMap = [ordered]@{
    'AIRWAY HEIGHTS' = 'Airway Heights'
    'ATHOL'          = 'Athol'
    "COEUR D'ALENE"  = "Coeur d'Alene"
    'POST FALLS'     = 'Post Falls'
    'HAYDEN'         = 'Hayden'
    'LIBERTY LAKE'   = 'Liberty Lake'
    'NORTH SPOKANE'  = 'North Spokane'
    'SOUTH SPOKANE'  = 'South Spokane'
    'SPOKANE VALLEY' = 'Spokane Valley'
    'WEST SPOKANE'   = 'West Spokane'
    'WORLEY'         = 'Worley'
    'DOWNTOWN'       = 'Downtown'
}
# Get-CleanText - Converts an HTML fragment into one line of plain text.
#   $rawHtml : HTML markup; $null yields an empty string.
#   Returns  : the text with tags removed, character references decoded,
#              runs of whitespace collapsed to one space, and the ends trimmed.
function Get-CleanText($rawHtml) {
    # Replace each tag with a space so text from adjacent elements does not fuse.
    $t = $rawHtml -replace '<[^>]+>', ' '
    # En/em dash references are flattened to an ASCII hyphen and &nbsp; to a
    # plain space. This runs before the general decode, which would otherwise
    # turn them into the Unicode characters.
    $t = $t -replace '&#8211;', '-'
    $t = $t -replace '&#8212;', '-'
    $t = $t -replace '&nbsp;', ' '
    # Decode every remaining named or numeric reference in a single pass.
    # A single pass also means '&amp;lt;' becomes the literal text '&lt;'
    # rather than being decoded twice into '<'.
    $t = [System.Net.WebUtility]::HtmlDecode($t)
    $t = $t -replace '\s+', ' '
    $t.Trim()
}
# Extract-Dishes - Pulls the dishes out of one course's HTML fragment.
#   $courseHtml : HTML for a single course (as returned by Extract-CourseBlock).
#   Returns     : an ArrayList of objects with 'name' and 'desc' properties, one
#                 per <p> whose first <strong> element looks like a dish name.
#                 The list is returned un-enumerated (note the leading comma).
function Extract-Dishes($courseHtml) {
    $dishes = [System.Collections.ArrayList]@()
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline
    $pMatches = [regex]::Matches($courseHtml, '<p[^>]*>(.*?)</p>', $opts)
    foreach ($pm in $pMatches) {
        $pContent = $pm.Groups[1].Value
        # First <strong> = dish name; a paragraph without one is not a dish.
        $nameM = [regex]::Match($pContent, '<strong>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }
        $name = Get-CleanText $nameM.Groups[1].Value
        # Skip dietary-only names and very short strings
        if ($name -match '^(GF|GFA|V\+?|DF|V:|2025)$') { continue }
        if ($name.Length -lt 3) { continue }
        # Skip legend lines like "GF:". -cmatch keeps this case-sensitive: the
        # default -match ignores case and would also discard genuine dish names
        # such as "Pho: ...".
        if ($name -cmatch '^[A-Z]{1,3}:') { continue }
        if ($name.Length -gt 80) { continue }
        # Description: everything after the first <br/>; when the paragraph has
        # no <br/>, everything after the first </strong>.
        $afterBr = ''
        if ($pContent -match '(?s)<br\s*/?>(.*?)$') {
            $afterBr = $matches[1]
        } else {
            $afterStrong = [regex]::Match($pContent, '(?s)</strong>(.*?)$', $opts)
            if ($afterStrong.Success) { $afterBr = $afterStrong.Groups[1].Value }
        }
        $desc = Get-CleanText $afterBr
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }
    # The unary comma stops PowerShell from unrolling the list on return.
    return ,$dishes
}
# Extract-CourseBlock - Returns the raw HTML that belongs to one menu course.
#   $html        : full page markup.
#   $courseLabel : heading text of the wanted course, e.g. 'First Course'.
#   $nextLabel   : heading text of the following course, or $null for the last one.
#   Returns      : the HTML fragment, or '' when nothing could be located.
function Extract-CourseBlock($html, $courseLabel, $nextLabel) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # Attempt 1: take everything between this course's label and the next label.
    if ($nextLabel) {
        $betweenLabels = [regex]::Escape($courseLabel) + '(.+?)(?=' + [regex]::Escape($nextLabel) + ')'
        $span = [regex]::Match($html, $betweenLabels, $singleLine)
        if ($span.Success) { return $span.Groups[1].Value }
    }

    # Attempt 2: look at no more than 6000 characters starting at the label and
    # take the first et_pb_text_inner block that does not open with a heading.
    $labelPos = $html.IndexOf($courseLabel)
    if ($labelPos -lt 0) { return '' }

    $window = $html.Substring($labelPos, [Math]::Min(6000, $html.Length - $labelPos))
    $block = [regex]::Match($window, '(?s)et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)', $singleLine)
    if ($block.Success) { return $block.Groups[1].Value }

    return ''
}
# --- Main scrape loop ---
# Fetches the archived page for every slug, extracts the fields from the raw HTML
# with regexes, and appends one record per slug to $restaurants. If anything in
# the try block throws, a placeholder record with hours = 'FETCH_ERROR' is added
# instead, so the output always has one entry per slug.
$restaurants = [System.Collections.ArrayList]@()
$total = $slugs.Count
$i = 0
foreach ($slug in $slugs) {
    $i++
    Write-Host "[$i/$total] Fetching: $slug" -NoNewline
    $url = "https://web.archive.org/web/20250306132630/https://inlanderrestaurantweek.com/project/$slug/"
    try {
        $response = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 60 -ErrorAction Stop
        # Read content as bytes then decode as UTF-8 to preserve special chars
        $bytes = $response.RawContentStream.ToArray()
        $html = [System.Text.Encoding]::UTF8.GetString($bytes)

        # --- Name ---
        # The <title> text is HTML-encoded (e.g. '&amp;'), so it goes through the
        # same cleaner as dish text rather than being stored with raw entities.
        $nameM = [regex]::Match($html, '<title>(.+?) \| Inlander')
        $name = if ($nameM.Success) { Get-CleanText $nameM.Groups[1].Value } else { $slug }

        # --- Price (from <strong>$45</strong> in an h1) ---
        $priceM = [regex]::Match($html, '<strong>\$(\d+)</strong>')
        $price = if ($priceM.Success) { [int]$priceM.Groups[1].Value } else { 0 }

        # --- Cuisine ---
        $cuisineM = [regex]::Match($html, 'CUISINE:\s*([A-Z][A-Za-z/ ]+?)(?:\s*</|\s*<)')
        $cuisine = ''
        if ($cuisineM.Success) {
            $c = $cuisineM.Groups[1].Value.Trim()
            # Lower-case first: ToTitleCase leaves all-caps words untouched.
            $cuisine = (Get-Culture).TextInfo.ToTitleCase($c.ToLower())
        }

        # --- Phone ---
        $phoneM = [regex]::Match($html, '\((?:208|509)\) \d{3}-\d{4}')
        $phone = if ($phoneM.Success) { $phoneM.Value } else { '' }

        # --- Area(s) ---
        # NOTE(review): this matches against the whole page, so an area name that
        # appears in shared page chrome would be picked up for every restaurant -
        # confirm against a real page.
        $areas = [System.Collections.ArrayList]@()
        $htmlUpper = $html.ToUpper()
        foreach ($aKey in $areaMap.Keys) {
            if ($htmlUpper.Contains($aKey)) {
                $null = $areas.Add($areaMap[$aKey])
            }
        }
        $areas = @($areas | Select-Object -Unique)
        if ($areas.Count -eq 0) { $areas = @('Downtown') }

        # --- Hours ---
        # Cleaned like the name so entities such as '&#8211;' do not reach the JSON.
        $hoursM = [regex]::Match($html, 'Menu served [^<]+')
        $hours = if ($hoursM.Success) { Get-CleanText $hoursM.Value } else { '' }

        # --- Menu Courses ---
        $fc = Extract-CourseBlock $html 'First Course' 'Second Course'
        $sc = Extract-CourseBlock $html 'Second Course' 'Third Course'
        $tc = Extract-CourseBlock $html 'Third Course' $null
        $firstCourse = Extract-Dishes $fc
        $secondCourse = Extract-Dishes $sc
        $thirdCourse = Extract-Dishes $tc
        $fc1count = $firstCourse.Count
        $fc2count = $secondCourse.Count
        $fc3count = $thirdCourse.Count
        Write-Host " -> $name [$price] $fc1count/$fc2count/$fc3count courses"

        $null = $restaurants.Add([PSCustomObject]@{
            name = $name
            slug = $slug
            price = $price
            areas = $areas
            cuisine = $cuisine
            url = "https://inlanderrestaurantweek.com/project/$slug/"
            menu = [PSCustomObject]@{
                hours = $hours
                phone = $phone
                courses = [PSCustomObject]@{
                    'First Course' = @($firstCourse)
                    'Second Course' = @($secondCourse)
                    'Third Course' = @($thirdCourse)
                }
            }
        })
    } catch {
        Write-Host " ERROR: $_"
        # Placeholder record with the same shape as a successful one.
        $null = $restaurants.Add([PSCustomObject]@{
            name = $slug
            slug = $slug
            price = 0
            areas = @('Downtown')
            cuisine = ''
            url = "https://inlanderrestaurantweek.com/project/$slug/"
            menu = [PSCustomObject]@{
                hours = 'FETCH_ERROR'
                phone = ''
                courses = [PSCustomObject]@{
                    'First Course' = @()
                    'Second Course' = @()
                    'Third Course' = @()
                }
            }
        })
    }
    # Pause between requests.
    Start-Sleep -Milliseconds 500
}
# --- Write results ---
# Serialises every collected record to 2025-restaurants.json. The original
# per-user directory is still used when it exists, so the output location is
# unchanged on that machine. Anywhere else the system temp directory is used,
# where the hard-coded path would make WriteAllText throw.
$legacyDir = 'C:\Users\derekc.CHNSLocal\AppData\Local\Temp'
$outDir = if (Test-Path -LiteralPath $legacyDir -PathType Container) { $legacyDir } else { [System.IO.Path]::GetTempPath() }
$outPath = Join-Path $outDir '2025-restaurants.json'
# -Depth 10 is needed because records nest (restaurant > menu > courses > dish),
# deeper than ConvertTo-Json's default depth.
$json = $restaurants | ConvertTo-Json -Depth 10
[System.IO.File]::WriteAllText($outPath, $json, [System.Text.Encoding]::UTF8)
Write-Host ""
Write-Host "Done! Saved $($restaurants.Count) restaurants to $outPath"