Created the file for 2025 and started parsing out the content from the archive website, but was rate-limited. Will need to finish this in the future.
This commit is contained in:
227
scrape-2025.ps1
Normal file
227
scrape-2025.ps1
Normal file
@@ -0,0 +1,227 @@
|
||||
# scrape-2025.ps1 - Scrapes 2025 Inlander Restaurant Week menus from Wayback Machine
|
||||
# Run from local path (UNC paths block PS execution)
|
||||
|
||||
# URL slugs of the restaurants to scrape. Each slug is substituted into
# https://inlanderrestaurantweek.com/project/<slug>/ by the fetch loop below.
$slugs = @(
    "1898", "24taps", "315cuisine", "ambrosia", "anthonys", "arrowhead", "baba",
    "backyardpublichouse", "bangkokthai", "bardenay", "barkrescuepub", "beverlys",
    "blackpearl", "borracho", "burgerdock", "cascadia", "cedars", "centennial",
    "chaps", "chinook", "chowderhead", "clinkerdagger", "cochinito", "collectivekitchen",
    "dassteinhaus", "deleons", "deleonstexmex", "dockside", "downriver", "dryfly",
    "durkins", "east", "emrys", "feastworldkitchen", "flameandcork", "flatstick",
    "flyinggoat", "fortheloveofgod", "francaise", "ganderryegrass", "gardenparty",
    "gildedunicorn", "hang10", "heritage", "hogwash", "honey", "hulapot",
    "indiahouse", "indicana", "inlandpacifickitchen", "irongoat", "ironwoodice",
    "karma", "kasa", "kismet", "kunisthai", "latahbistro", "lebanon", "legendsoffire",
    "littledragon", "littlenoodle", "longhornbbq", "loren", "lumberbeard",
    "macdaddys", "mackenzieriver", "mammamias", "mangotree", "maryhill",
    "masselowslounge", "max", "meltingpot", "mortys", "northhill", "odohertys",
    "osprey", "outsider", "palmcourtgrill", "ponderosa", "purenorthwest",
    "purgatory", "qqsushi", "redtail", "republickitchen", "republicpi", "rut",
    "safariroom", "saranac", "satay", "sauced", "screamingyak", "seasons",
    "shawnodonnells", "shelbys", "skewers", "southhillgrill", "southperrylantern",
    "spencers", "steamplant", "steelhead", "stylus", "sweetlous", "swinglounge",
    "table13", "tavolata", "terraza", "thaibamboo", "thedambar", "titos",
    "tomatostreet", "tonysonthelake", "torratea", "truelegends", "twigs",
    "uprise", "vaqueros", "vicinopizza", "victoryburger", "vieuxcarre",
    "vineolive", "wileys"
)
|
||||
|
||||
# Maps the upper-case area labels searched for in each page's HTML to the
# display names stored in the output. [ordered] keeps the iteration order
# fixed, so the areas recorded for a restaurant always appear in this order.
$areaMap = [ordered]@{
    "AIRWAY HEIGHTS" = "Airway Heights"
    "ATHOL"          = "Athol"
    "COEUR D'ALENE"  = "Coeur d'Alene"
    "POST FALLS"     = "Post Falls"
    "HAYDEN"         = "Hayden"
    "LIBERTY LAKE"   = "Liberty Lake"
    "NORTH SPOKANE"  = "North Spokane"
    "SOUTH SPOKANE"  = "South Spokane"
    "SPOKANE VALLEY" = "Spokane Valley"
    "WEST SPOKANE"   = "West Spokane"
    "WORLEY"         = "Worley"
    "DOWNTOWN"       = "Downtown"
}
|
||||
|
||||
# Converts an HTML fragment into single-line plain text.
#
# Parameters:
#   $rawHtml - HTML fragment (string); $null or empty yields ''.
# Returns:
#   The text with tags removed, HTML entities decoded, en/em dashes replaced
#   by '-', and all whitespace runs collapsed to single spaces and trimmed.
function Get-CleanText($rawHtml) {
    if ([string]::IsNullOrEmpty($rawHtml)) { return '' }

    # Strip tags before decoding so escaped markup in the text
    # (e.g. "&lt;b&gt;") is kept as literal text instead of being removed.
    $text = $rawHtml -replace '<[^>]+>', ' '

    # Decode named and numeric entities (&amp; &lt; &gt; &quot; &#39; &nbsp;
    # &ndash; &mdash; ...) in a single pass; one pass avoids double-decoding
    # input such as "&amp;lt;".
    $text = [System.Net.WebUtility]::HtmlDecode($text)

    # Normalise non-breaking spaces and en/em dashes to plain ASCII.
    $text = $text -replace '\u00A0', ' '
    $text = $text -replace '[\u2013\u2014]', '-'

    # Collapse whitespace runs (including newlines) and trim the ends.
    ($text -replace '\s+', ' ').Trim()
}
|
||||
|
||||
# Extracts the dishes listed in one course's HTML.
#
# A dish is a <p> element whose first <strong>...</strong> holds the dish
# name; the description is the text following the first <br> after that name
# or, when there is no such <br>, everything following the name.
#
# Parameters:
#   $courseHtml - HTML of a single course section; empty input yields no dishes.
# Returns:
#   An ArrayList of PSCustomObjects with 'name' and 'desc' properties. It is
#   returned with the unary comma so the caller receives the list itself
#   rather than its enumerated elements.
function Extract-Dishes($courseHtml) {
    $dishes = [System.Collections.ArrayList]@()
    if ([string]::IsNullOrEmpty($courseHtml)) { return ,$dishes }

    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline
    $ignoreCase = [System.Text.RegularExpressions.RegexOptions]::IgnoreCase

    # \b stops <pre>, <param>, <picture>, ... from being treated as <p>.
    $paragraphs = [regex]::Matches($courseHtml, '<p\b[^>]*>(.*?)</p>', $opts)

    foreach ($paragraph in $paragraphs) {
        $pContent = $paragraph.Groups[1].Value

        # First <strong> = dish name; paragraphs without one are not dishes.
        $nameM = [regex]::Match($pContent, '<strong>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }
        $name = Get-CleanText $nameM.Groups[1].Value

        # Skip dietary-only names, legend lines and implausible lengths.
        # NOTE: -match is case-insensitive, so [A-Z] below also matches
        # lower-case letters.
        if ($name -match '^(GF|GFA|V\+?|DF|V:|2025)$') { continue }
        if ($name.Length -lt 3) { continue }
        if ($name -match '^[A-Z]{1,3}:') { continue }   # skip legend lines like "GF:"
        if ($name.Length -gt 80) { continue }

        # Description: look only at what follows the name's closing </strong>,
        # so a <br> before or inside the name cannot leak name text into it.
        $tail = $pContent.Substring($nameM.Index + $nameM.Length)
        $brM = [regex]::Match($tail, '<br\s*/?>', $ignoreCase)
        $descHtml = $tail
        if ($brM.Success) {
            $descHtml = $tail.Substring($brM.Index + $brM.Length)
        }

        $desc = Get-CleanText $descHtml
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }

    return ,$dishes
}
|
||||
|
||||
# Returns the raw HTML belonging to one course section of a restaurant page.
#
# Parameters:
#   $html        - full page HTML; $null or empty yields ''.
#   $courseLabel - heading text that starts the section (e.g. 'First Course').
#   $nextLabel   - heading text of the following section, or $null/'' when the
#                  section is the last one.
# Returns:
#   The HTML between the labels (strategy 1), or the first non-heading
#   et_pb_text_inner block within 6000 characters after $courseLabel
#   (strategy 2), or '' when neither strategy matches.
function Extract-CourseBlock($html, $courseLabel, $nextLabel) {
    if ([string]::IsNullOrEmpty($html)) { return '' }

    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # Strategy 1: everything after the course label, up to the next label.
    if ($nextLabel) {
        $pattern = [regex]::Escape($courseLabel) + '(.+?)(?=' + [regex]::Escape($nextLabel) + ')'
        $between = [regex]::Match($html, $pattern, $opts)
        if ($between.Success) { return $between.Groups[1].Value }
    }

    # Strategy 2: the et_pb_text_inner block immediately following the label.
    # Ordinal search matches the regex semantics used above and avoids the
    # culture-sensitive comparison rules of the default IndexOf overload.
    $idx = $html.IndexOf($courseLabel, [System.StringComparison]::Ordinal)
    if ($idx -lt 0) { return '' }

    # Limit the search window so an unrelated later block cannot match.
    $window = $html.Substring($idx, [Math]::Min(6000, $html.Length - $idx))

    # Skip heading-only blocks (<h1>-<h3>) and stop at the next heading block
    # or at the run of closing </div>s that ends the module.
    $innerM = [regex]::Match(
        $window,
        'et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)',
        $opts)
    if ($innerM.Success) { return $innerM.Groups[1].Value }

    return ''
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main: fetch every restaurant page from the Wayback Machine snapshot, parse
# it, and write all records to a JSON file in the temp directory.
# ---------------------------------------------------------------------------

# One record per slug; failed fetches get a placeholder record whose
# menu.hours is 'FETCH_ERROR' so they can be found and re-run later.
$restaurants = [System.Collections.ArrayList]@()
$total = $slugs.Count
$i = 0

# The archive rate-limits bursts of requests, so each fetch is attempted up
# to this many times, waiting 5s and then 15s between attempts.
$maxAttempts = 3

foreach ($slug in $slugs) {
    $i++
    Write-Host "[$i/$total] Fetching: $slug" -NoNewline

    $url = "https://web.archive.org/web/20250306132630/https://inlanderrestaurantweek.com/project/$slug/"

    try {
        $response = $null
        for ($attempt = 1; $attempt -le $maxAttempts; $attempt++) {
            try {
                $response = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 60 -ErrorAction Stop
                break
            } catch {
                # Status 0 means no HTTP response at all (timeout, connection
                # refused). Retry those, 429 (rate limited) and 5xx; any other
                # status (e.g. 404) will not improve on retry.
                $status = 0
                if ($_.Exception.Response) { $status = [int]$_.Exception.Response.StatusCode }
                $retryable = ($status -eq 0) -or ($status -eq 429) -or ($status -ge 500)
                if (-not $retryable -or $attempt -ge $maxAttempts) { throw }

                $waitSec = [int](5 * [Math]::Pow(3, $attempt - 1))
                Write-Host " (attempt $attempt failed, retrying in ${waitSec}s)" -NoNewline
                Start-Sleep -Seconds $waitSec
            }
        }

        # Read content as bytes then decode as UTF-8 to preserve special chars
        $bytes = $response.RawContentStream.ToArray()
        $html = [System.Text.Encoding]::UTF8.GetString($bytes)

        # --- Name (from the <title>, falling back to the slug) ---
        # Decode entities so titles such as "A &amp; B" are stored as "A & B".
        $nameM = [regex]::Match($html, '<title>(.+?) \| Inlander')
        $name = $slug
        if ($nameM.Success) {
            $name = [System.Net.WebUtility]::HtmlDecode($nameM.Groups[1].Value).Trim()
        }

        # --- Price (from <strong>$45</strong> in an h1); 0 when not found ---
        $priceM = [regex]::Match($html, '<strong>\$(\d+)</strong>')
        $price = if ($priceM.Success) { [int]$priceM.Groups[1].Value } else { 0 }

        # --- Cuisine (text after "CUISINE:", normalised to Title Case) ---
        $cuisineM = [regex]::Match($html, 'CUISINE:\s*([A-Z][A-Za-z/ ]+?)(?:\s*</|\s*<)')
        $cuisine = ''
        if ($cuisineM.Success) {
            $c = $cuisineM.Groups[1].Value.Trim()
            # ToTitleCase leaves all-caps words untouched, hence ToLower first.
            $cuisine = (Get-Culture).TextInfo.ToTitleCase($c.ToLower())
        }

        # --- Phone (first "(208) ddd-dddd" or "(509) ddd-dddd") ---
        $phoneM = [regex]::Match($html, '\((?:208|509)\) \d{3}-\d{4}')
        $phone = if ($phoneM.Success) { $phoneM.Value } else { '' }

        # --- Area(s): every known area label that appears anywhere in the page ---
        $areas = [System.Collections.ArrayList]@()
        $htmlUpper = $html.ToUpper()
        foreach ($aKey in $areaMap.Keys) {
            if ($htmlUpper.Contains($aKey)) {
                $null = $areas.Add($areaMap[$aKey])
            }
        }
        $areas = @($areas | Select-Object -Unique)
        if ($areas.Count -eq 0) { $areas = @('Downtown') }

        # --- Hours ("Menu served ..." up to the next tag) ---
        $hoursM = [regex]::Match($html, 'Menu served [^<]+')
        $hours = if ($hoursM.Success) { $hoursM.Value.Trim() } else { '' }

        # --- Menu Courses ---
        $firstHtml  = Extract-CourseBlock $html 'First Course' 'Second Course'
        $secondHtml = Extract-CourseBlock $html 'Second Course' 'Third Course'
        $thirdHtml  = Extract-CourseBlock $html 'Third Course' $null

        $firstCourse  = Extract-Dishes $firstHtml
        $secondCourse = Extract-Dishes $secondHtml
        $thirdCourse  = Extract-Dishes $thirdHtml

        $firstCount  = $firstCourse.Count
        $secondCount = $secondCourse.Count
        $thirdCount  = $thirdCourse.Count
        Write-Host " -> $name [$price] $firstCount/$secondCount/$thirdCount courses"

        $null = $restaurants.Add([PSCustomObject]@{
            name    = $name
            slug    = $slug
            price   = $price
            areas   = $areas
            cuisine = $cuisine
            url     = "https://inlanderrestaurantweek.com/project/$slug/"
            menu    = [PSCustomObject]@{
                hours   = $hours
                phone   = $phone
                courses = [PSCustomObject]@{
                    'First Course'  = @($firstCourse)
                    'Second Course' = @($secondCourse)
                    'Third Course'  = @($thirdCourse)
                }
            }
        })

    } catch {
        # Any failure (fetch or parse) is recorded as a placeholder so the
        # output still contains one entry per slug.
        Write-Host " ERROR: $_"
        $null = $restaurants.Add([PSCustomObject]@{
            name    = $slug
            slug    = $slug
            price   = 0
            areas   = @('Downtown')
            cuisine = ''
            url     = "https://inlanderrestaurantweek.com/project/$slug/"
            menu    = [PSCustomObject]@{
                hours   = 'FETCH_ERROR'
                phone   = ''
                courses = [PSCustomObject]@{
                    'First Course'  = @()
                    'Second Course' = @()
                    'Third Course'  = @()
                }
            }
        })
    }

    # Pause between requests to stay polite to the archive.
    Start-Sleep -Milliseconds 500
}

# Write to the current user's temp directory rather than a path tied to one
# specific user profile.
$outPath = Join-Path ([System.IO.Path]::GetTempPath()) '2025-restaurants.json'
$json = $restaurants | ConvertTo-Json -Depth 10
[System.IO.File]::WriteAllText($outPath, $json, [System.Text.Encoding]::UTF8)
Write-Host ""
Write-Host "Done! Saved $($restaurants.Count) restaurants to $outPath"
|
||||
Reference in New Issue
Block a user