# scrape-2025.ps1 - Scrapes the 2025 Inlander Restaurant Week menus from the Wayback Machine.
# Run from a local path (UNC paths block PowerShell script execution).

# Project-page slugs of every participating restaurant. Each slug is appended
# to the archived ".../project/<slug>/" URL when the pages are fetched.
# The here-string is split on whitespace, so slugs may be wrapped freely.
$slugs = @(-split @'
1898 24taps 315cuisine ambrosia anthonys arrowhead baba
backyardpublichouse bangkokthai bardenay barkrescuepub beverlys
blackpearl borracho burgerdock cascadia cedars centennial
chaps chinook chowderhead clinkerdagger cochinito collectivekitchen
dassteinhaus deleons deleonstexmex dockside downriver dryfly
durkins east emrys feastworldkitchen flameandcork flatstick
flyinggoat fortheloveofgod francaise ganderryegrass gardenparty
gildedunicorn hang10 heritage hogwash honey hulapot
indiahouse indicana inlandpacifickitchen irongoat ironwoodice
karma kasa kismet kunisthai latahbistro lebanon legendsoffire
littledragon littlenoodle longhornbbq loren lumberbeard
macdaddys mackenzieriver mammamias mangotree maryhill
masselowslounge max meltingpot mortys northhill odohertys
osprey outsider palmcourtgrill ponderosa purenorthwest
purgatory qqsushi redtail republickitchen republicpi rut
safariroom saranac satay sauced screamingyak seasons
shawnodonnells shelbys skewers southhillgrill southperrylantern
spencers steamplant steelhead stylus sweetlous swinglounge
table13 tavolata terraza thaibamboo thedambar titos
tomatostreet tonysonthelake torratea truelegends twigs
uprise vaqueros vicinopizza victoryburger vieuxcarre
vineolive wileys
'@)
# Ordered lookup from the upper-case area label searched for in the page HTML
# to the display name stored in the output. Insertion order is preserved, so
# it also fixes the order in which matched areas are reported.
$areaMap = [ordered]@{}
$areaDisplayNames = @(
    'Airway Heights', 'Athol', "Coeur d'Alene", 'Post Falls', 'Hayden',
    'Liberty Lake', 'North Spokane', 'South Spokane', 'Spokane Valley',
    'West Spokane', 'Worley', 'Downtown'
)
foreach ($areaDisplayName in $areaDisplayNames) {
    # Every key is simply the display name in upper case.
    $areaMap[$areaDisplayName.ToUpperInvariant()] = $areaDisplayName
}
# Converts an HTML fragment to plain single-line text.
#   $rawHtml - HTML markup (may be $null or empty).
# Returns the text with tags removed, HTML entities decoded, en/em dashes
# normalized to '-', and all whitespace runs collapsed to a single space.
function Get-CleanText($rawHtml) {
    if ([string]::IsNullOrEmpty($rawHtml)) { return '' }

    # Replace tags with a space so adjacent elements do not run together.
    $t = $rawHtml -replace '<[^>]+>', ' '

    # Decode named and numeric entities (&amp;, &lt;, &#39;, &nbsp;, &#8211;, ...)
    # in one pass instead of maintaining a hand-written replacement list.
    $t = [System.Net.WebUtility]::HtmlDecode($t)

    # En dash (U+2013) and em dash (U+2014) become a plain hyphen.
    $t = $t -replace '[\u2013\u2014]', '-'

    # Collapse whitespace, including the non-breaking space produced by &nbsp;.
    $t = $t -replace '[\s\u00A0]+', ' '

    $t.Trim()
}
# Parses the HTML of one menu course into a list of dishes.
#   $courseHtml - HTML fragment for a single course (may be empty).
# Returns an ArrayList of objects with 'name' and 'desc' properties. Each <p>
# that contains a <strong> element yields one dish: the first <strong> is the
# name, and the text after the first <br> (or, failing that, after the first
# </strong>) is the description.
function Extract-Dishes($courseHtml) {
    $dishes = [System.Collections.ArrayList]@()
    # HTML tag names are case-insensitive, and paragraphs may span lines.
    $opts = [System.Text.RegularExpressions.RegexOptions]'Singleline, IgnoreCase'

    foreach ($pm in [regex]::Matches($courseHtml, '<p[^>]*>(.*?)</p>', $opts)) {
        $pContent = $pm.Groups[1].Value

        # First <strong> = dish name; attributes on the tag are tolerated.
        $nameM = [regex]::Match($pContent, '<strong\b[^>]*>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }
        $name = Get-CleanText $nameM.Groups[1].Value

        # Skip dietary-only markers and the year heading.
        if ($name -match '^(GF|GFA|V\+?|DF|V:|2025)$') { continue }
        # Skip strings too short or too long to be a dish name.
        if ($name.Length -lt 3 -or $name.Length -gt 80) { continue }
        # Skip legend lines like "GF:". -cmatch keeps [A-Z] upper-case only;
        # plain -match is case-insensitive and would drop names like "Pho: ...".
        if ($name -cmatch '^[A-Z]{1,3}:') { continue }

        # Description: everything after the first <br>, else after </strong>.
        $afterBr = ''
        $brM = [regex]::Match($pContent, '<br\s*/?>(.*?)$', $opts)
        if ($brM.Success) {
            $afterBr = $brM.Groups[1].Value
        } else {
            $afterStrong = [regex]::Match($pContent, '</strong>(.*?)$', $opts)
            if ($afterStrong.Success) { $afterBr = $afterStrong.Groups[1].Value }
        }

        $desc = Get-CleanText $afterBr
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }

    # The leading comma stops PowerShell from unrolling the list on return.
    return ,$dishes
}
# Extracts the HTML belonging to one menu course from a restaurant page.
#   $html        - full page HTML (may be $null or empty).
#   $courseLabel - heading text of the wanted course, e.g. 'First Course'.
#   $nextLabel   - heading text of the following course, or $null for the last.
# Returns the HTML fragment for the course, or '' when nothing is found.
function Extract-CourseBlock($html, $courseLabel, $nextLabel) {
    # Nothing to search: return the "not found" value instead of throwing.
    if ([string]::IsNullOrEmpty($html) -or [string]::IsNullOrEmpty($courseLabel)) { return '' }

    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # Strategy 1: take everything between the course label and the next label.
    if ($nextLabel) {
        $pattern = [regex]::Escape($courseLabel) + '(.+?)(?=' + [regex]::Escape($nextLabel) + ')'
        $m = [regex]::Match($html, $pattern, $opts)
        if ($m.Success) { return $m.Groups[1].Value }
    }

    # Strategy 2: find the et_pb_text_inner block immediately following the
    # course label. Ordinal search avoids culture-dependent matching.
    $idx = $html.IndexOf($courseLabel, [System.StringComparison]::Ordinal)
    if ($idx -ge 0) {
        # Only look at a bounded window after the label.
        $sub = $html.Substring($idx, [Math]::Min(6000, $html.Length - $idx))
        # Skip past the heading block and find the next text_inner content.
        $innerM = [regex]::Match($sub, 'et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)', $opts)
        if ($innerM.Success) { return $innerM.Groups[1].Value }
    }

    return ''
}
# Builds one restaurant record in the exact shape that is serialized to JSON.
# Used for both successfully scraped pages and fetch-error placeholders so the
# two paths cannot drift apart.
function New-RestaurantRecord {
    param(
        $Name,
        $Slug,
        $Price = 0,
        $Areas = @('Downtown'),
        $Cuisine = '',
        $Hours = '',
        $Phone = '',
        $First = @(),
        $Second = @(),
        $Third = @()
    )

    [PSCustomObject]@{
        name    = $Name
        slug    = $Slug
        price   = $Price
        areas   = @($Areas)
        cuisine = $Cuisine
        url     = "https://inlanderrestaurantweek.com/project/$Slug/"
        menu    = [PSCustomObject]@{
            hours   = $Hours
            phone   = $Phone
            courses = [PSCustomObject]@{
                'First Course'  = @($First)
                'Second Course' = @($Second)
                'Third Course'  = @($Third)
            }
        }
    }
}

# Accumulates one record per slug, in slug order.
$restaurants = [System.Collections.ArrayList]@()
$total = $slugs.Count
$i = 0

foreach ($slug in $slugs) {
    $i++
    Write-Host "[$i/$total] Fetching: $slug" -NoNewline

    # Archived snapshot of the restaurant's project page.
    $url = "https://web.archive.org/web/20250306132630/https://inlanderrestaurantweek.com/project/$slug/"

    try {
        $response = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 60 -ErrorAction Stop
        # Read content as bytes then decode as UTF-8 to preserve special chars.
        $bytes = $response.RawContentStream.ToArray()
        $html = [System.Text.Encoding]::UTF8.GetString($bytes)

        # --- Name (page title up to " | Inlander"; falls back to the slug) ---
        # The title is HTML, so entities such as &amp; must be decoded.
        $nameM = [regex]::Match($html, '<title>(.+?) \| Inlander')
        $name = if ($nameM.Success) {
            [System.Net.WebUtility]::HtmlDecode($nameM.Groups[1].Value).Trim()
        } else {
            $slug
        }

        # --- Price (from <strong>$45</strong> in an h1); 0 when not found ---
        $priceM = [regex]::Match($html, '<strong>\$(\d+)</strong>')
        $price = if ($priceM.Success) { [int]$priceM.Groups[1].Value } else { 0 }

        # --- Cuisine (text after "CUISINE:", stored in title case) ---
        $cuisineM = [regex]::Match($html, 'CUISINE:\s*([A-Z][A-Za-z/ ]+?)(?:\s*</|\s*<)')
        $cuisine = ''
        if ($cuisineM.Success) {
            $c = $cuisineM.Groups[1].Value.Trim()
            $cuisine = (Get-Culture).TextInfo.ToTitleCase($c.ToLower())
        }

        # --- Phone (first "(208) ddd-dddd" or "(509) ddd-dddd" on the page) ---
        $phoneM = [regex]::Match($html, '\((?:208|509)\) \d{3}-\d{4}')
        $phone = if ($phoneM.Success) { $phoneM.Value } else { '' }

        # --- Area(s): every known area label that appears anywhere in the page ---
        $areas = [System.Collections.ArrayList]@()
        $htmlUpper = $html.ToUpper()
        foreach ($aKey in $areaMap.Keys) {
            if ($htmlUpper.Contains($aKey)) {
                $null = $areas.Add($areaMap[$aKey])
            }
        }
        $areas = @($areas | Select-Object -Unique)
        # Default when no area label was found on the page.
        if ($areas.Count -eq 0) { $areas = @('Downtown') }

        # --- Hours (text starting at "Menu served" up to the next tag) ---
        $hoursM = [regex]::Match($html, 'Menu served [^<]+')
        $hours = if ($hoursM.Success) {
            [System.Net.WebUtility]::HtmlDecode($hoursM.Value).Trim()
        } else {
            ''
        }

        # --- Menu Courses ---
        $fc = Extract-CourseBlock $html 'First Course' 'Second Course'
        $sc = Extract-CourseBlock $html 'Second Course' 'Third Course'
        $tc = Extract-CourseBlock $html 'Third Course' $null

        $firstCourse = Extract-Dishes $fc
        $secondCourse = Extract-Dishes $sc
        $thirdCourse = Extract-Dishes $tc

        $fc1count = $firstCourse.Count
        $fc2count = $secondCourse.Count
        $fc3count = $thirdCourse.Count
        Write-Host " -> $name [$price] $fc1count/$fc2count/$fc3count courses"

        $null = $restaurants.Add((New-RestaurantRecord -Name $name -Slug $slug -Price $price `
            -Areas $areas -Cuisine $cuisine -Hours $hours -Phone $phone `
            -First $firstCourse -Second $secondCourse -Third $thirdCourse))

    } catch {
        # Keep a placeholder so the output still has one entry per slug;
        # hours = 'FETCH_ERROR' marks the record as failed.
        Write-Host " ERROR: $_"
        $null = $restaurants.Add((New-RestaurantRecord -Name $slug -Slug $slug -Hours 'FETCH_ERROR'))
    }

    # Pause between requests to avoid hammering the archive.
    Start-Sleep -Milliseconds 500
}
# Serialize all collected records to JSON in the current user's temp directory.
# The temp directory is resolved at run time rather than hard-coding one
# user's profile path, so the script works on any account or machine.
$outPath = Join-Path ([System.IO.Path]::GetTempPath()) '2025-restaurants.json'

# -InputObject (instead of piping) keeps the top-level JSON value an array even
# when the collection holds a single record. Depth 10 covers the nested
# menu -> courses -> dish objects.
$json = ConvertTo-Json -InputObject @($restaurants) -Depth 10

# Encoding.UTF8 writes a byte-order mark; it is kept deliberately so Windows
# PowerShell readers detect the file as UTF-8.
[System.IO.File]::WriteAllText($outPath, $json, [System.Text.Encoding]::UTF8)

Write-Host ""
Write-Host "Done! Saved $($restaurants.Count) restaurants to $outPath"