Created the 2025 restaurant data file and started parsing its content from the archived website, but was rate limited before finishing. The remaining restaurants will need to be completed in a future commit.
This commit is contained in:
6207
2025-restaurants.json
Normal file
6207
2025-restaurants.json
Normal file
File diff suppressed because it is too large
Load Diff
121
fix-2025.ps1
Normal file
121
fix-2025.ps1
Normal file
@@ -0,0 +1,121 @@
|
||||
# fix-2025.ps1 - Post-process the scraped 2025 restaurant JSON
|
||||
# Fixes: HTML entities in names/descs, wrong prices, re-fetches 0-course restaurants
|
||||
|
||||
$projectDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||
$jsonPath = Join-Path $projectDir '2025-restaurants.json'
|
||||
|
||||
$data = Get-Content $jsonPath -Raw -Encoding UTF8 | ConvertFrom-Json
|
||||
|
||||
function Decode-Html($str) {
    # Decodes HTML character references in $str, collapses each run of
    # whitespace to a single space, and trims the result.
    # Returns $str untouched when it is $null or an empty string.
    if (-not $str) { return $str }

    # HtmlDecode resolves every named and numeric reference in a single pass.
    # Chained -replace calls that handle '&amp;' first decode twice: the text
    # '&amp;lt;' would end up as '<' instead of the intended '&lt;'.
    $s = [System.Net.WebUtility]::HtmlDecode([string]$str)

    # Fold the typographic apostrophe (U+2019) to ASCII so the stored text does
    # not depend on how the page encoded it.
    # NOTE(review): assumes the JSON should carry ASCII apostrophes - confirm.
    $s = $s -replace '\u2019', "'"

    # \s also matches the no-break space that '&nbsp;' decodes to.
    $s = $s -replace '\s+', ' '
    $s.Trim()
}
|
||||
|
||||
# ---- Report issues ----
# Read-only pass over $data: prints the restaurant count, then three lists -
# prices outside the 25/35/45 tiers, restaurants whose three course lists are
# all empty, and restaurants with at least one course list that does not hold
# exactly three dishes (all-empty ones are not repeated in the last list).
Write-Host "=== Data Quality Report ==="
Write-Host "Total restaurants: $($data.Count)"
Write-Host ""

Write-Host "Wrong prices (not 25/35/45):"
$data | ForEach-Object {
    if ($_.price -notin @(25,35,45)) {
        Write-Host " $($_.slug): price=$($_.price)"
    }
}

Write-Host ""
Write-Host "Zero-course restaurants (all 3 empty):"
$data | ForEach-Object {
    $n1 = $_.menu.courses.'First Course'.Count
    $n2 = $_.menu.courses.'Second Course'.Count
    $n3 = $_.menu.courses.'Third Course'.Count
    if ($n1 -eq 0 -and $n2 -eq 0 -and $n3 -eq 0) {
        Write-Host " $($_.slug) [price=$($_.price)] name=$($_.name)"
    }
}

Write-Host ""
Write-Host "Partial courses (any course != 3):"
$data | ForEach-Object {
    $n1 = $_.menu.courses.'First Course'.Count
    $n2 = $_.menu.courses.'Second Course'.Count
    $n3 = $_.menu.courses.'Third Course'.Count

    $anyOff   = ($n1 -ne 3 -or $n2 -ne 3 -or $n3 -ne 3)
    # Totally empty restaurants were already listed in the previous section.
    $allEmpty = ($n1 -eq 0 -and $n2 -eq 0 -and $n3 -eq 0)

    if ($anyOff -and -not $allEmpty) {
        Write-Host " $($_.slug): $n1/$n2/$n3"
    }
}

Write-Host ""
Write-Host "=== Applying Fixes ==="
|
||||
|
||||
# ---- Fix HTML entities in all string fields ----
# Runs Decode-Html over each restaurant's name and cuisine, and over the name
# and desc of every dish in its three course lists. Objects are updated in place.
foreach ($restaurant in $data) {
    $restaurant.name    = Decode-Html $restaurant.name
    $restaurant.cuisine = Decode-Html $restaurant.cuisine

    foreach ($label in @('First Course', 'Second Course', 'Third Course')) {
        $dishes = $restaurant.menu.courses.$label
        # Nothing to clean when the course list is missing or empty.
        if (-not $dishes) { continue }

        foreach ($dish in $dishes) {
            $dish.name = Decode-Html $dish.name
            $dish.desc = Decode-Html $dish.desc
        }
    }
}
|
||||
|
||||
# ---- Fix wrong prices by re-reading each restaurant's archived page ----
# Valid prices are the three tiers $25, $35 and $45. For every restaurant whose
# stored price is something else, the archived project page is fetched again
# and the price is taken from a tighter pattern than the first scrape used.

# @() forces an array. Without it a single match is a bare [pscustomobject],
# whose .Count is $null in Windows PowerShell, so the whole section would be
# skipped when exactly one restaurant has a wrong price.
$wrongPrice = @($data | Where-Object { $_ -and ($_.price -notin @(25,35,45)) })

if ($wrongPrice.Count -gt 0) {
    Write-Host "Re-fetching $($wrongPrice.Count) restaurants with wrong prices..."
    $singleline = [System.Text.RegularExpressions.RegexOptions]::Singleline

    foreach ($r in $wrongPrice) {
        Write-Host " $($r.slug)..." -NoNewline
        try {
            $url = "https://web.archive.org/web/20250306132630/https://inlanderrestaurantweek.com/project/$($r.slug)/"
            $resp = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 60 -ErrorAction Stop
            $html = $resp.Content

            $tier = $null

            # Preferred: an <h1> that contains a <strong> holding one of the tiers.
            $headline = [regex]::Match($html, '<h1[^>]*>.*?<strong>\$(25|35|45)</strong>', $singleline)
            if ($headline.Success) {
                $tier = $headline.Groups[1].Value
            }
            else {
                # Fallback: first <strong>$NN</strong> anywhere whose value is a tier.
                $firstValid = [regex]::Matches($html, '<strong>\$(\d+)</strong>') |
                    Where-Object { $_.Groups[1].Value -in @('25','35','45') } |
                    Select-Object -First 1
                if ($firstValid) { $tier = $firstValid.Groups[1].Value }
            }

            if ($tier) {
                $r.price = [int]$tier
                Write-Host " fixed to $($r.price)"
            }
            else {
                Write-Host " could not determine - left at $($r.price)"
            }
        }
        catch {
            # Best effort: report the failure and move on to the next restaurant.
            Write-Host " FETCH ERROR: $_"
        }

        # Short pause between requests to the archive.
        Start-Sleep -Milliseconds 300
    }
}
|
||||
|
||||
# ---- Save fixed JSON ----
$json = $data | ConvertTo-Json -Depth 10

# [System.Text.Encoding]::UTF8 emits a byte order mark; JSON files should not
# start with one (strict parsers reject it), so write UTF-8 without a BOM.
$utf8NoBom = New-Object System.Text.UTF8Encoding($false)
[System.IO.File]::WriteAllText($jsonPath, $json, $utf8NoBom)

Write-Host ""
Write-Host "Saved fixed JSON to $jsonPath"
|
||||
130
fix2-2025.ps1
Normal file
130
fix2-2025.ps1
Normal file
@@ -0,0 +1,130 @@
|
||||
# fix2-2025.ps1 - Comprehensive fix for 2025 restaurant JSON
|
||||
# 1. Fix all prices using authoritative data from price listing page
|
||||
# 2. Fix HTML entities in all text fields
|
||||
# 3. Report remaining issues
|
||||
|
||||
$projectDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||
$jsonPath = Join-Path $projectDir '2025-restaurants.json'
|
||||
|
||||
# Load JSON
|
||||
$data = Get-Content $jsonPath -Raw -Encoding UTF8 | ConvertFrom-Json
|
||||
|
||||
# ---- Authoritative price map from price listing page ----
|
||||
$authPrices = @{
|
||||
"1898"="45"; "24taps"="25"; "315cuisine"="45"; "ambrosia"="45";
|
||||
"anthonys"="45"; "arrowhead"="25"; "baba"="45"; "backyardpublichouse"="35";
|
||||
"bangkokthai"="35"; "bardenay"="45"; "barkrescuepub"="25"; "beverlys"="45";
|
||||
"blackpearl"="25"; "borracho"="35"; "burgerdock"="25"; "cascadia"="25";
|
||||
"cedars"="45"; "centennial"="35"; "chaps"="45"; "chinook"="45";
|
||||
"chowderhead"="35"; "clinkerdagger"="45"; "cochinito"="25"; "collectivekitchen"="45";
|
||||
"dassteinhaus"="35"; "deleons"="25"; "deleonstexmex"="25"; "dockside"="35";
|
||||
"downriver"="45"; "dryfly"="35"; "durkins"="45"; "east"="45";
|
||||
"emrys"="25"; "feastworldkitchen"="35"; "flameandcork"="35"; "flatstick"="25";
|
||||
"flyinggoat"="25"; "fortheloveofgod"="35"; "francaise"="45"; "ganderryegrass"="35";
|
||||
"gardenparty"="35"; "gildedunicorn"="45"; "hang10"="25"; "heritage"="35";
|
||||
"hogwash"="45"; "honey"="35"; "hulapot"="35"; "indiahouse"="35";
|
||||
"indicana"="45"; "inlandpacifickitchen"="45"; "irongoat"="35"; "ironwoodice"="35";
|
||||
"karma"="35"; "kasa"="25"; "kismet"="35"; "kunisthai"="35";
|
||||
"latahbistro"="45"; "lebanon"="35"; "legendsoffire"="45"; "littledragon"="25";
|
||||
"littlenoodle"="25"; "longhornbbq"="25"; "loren"="45"; "lumberbeard"="35";
|
||||
"macdaddys"="35"; "mackenzieriver"="25"; "mammamias"="25"; "mangotree"="25";
|
||||
"maryhill"="45"; "masselowslounge"="45"; "max"="45"; "meltingpot"="45";
|
||||
"mortys"="25"; "northhill"="35"; "odohertys"="35"; "osprey"="35";
|
||||
"outsider"="45"; "palmcourtgrill"="45"; "ponderosa"="35"; "purenorthwest"="35";
|
||||
"purgatory"="45"; "qqsushi"="35"; "redtail"="35"; "republickitchen"="35";
|
||||
"republicpi"="25"; "rut"="35"; "safariroom"="45"; "saranac"="35";
|
||||
"satay"="45"; "sauced"="25"; "screamingyak"="25"; "seasons"="45";
|
||||
"shawnodonnells"="25"; "shelbys"="25"; "skewers"="25"; "southhillgrill"="45";
|
||||
"southperrylantern"="45"; "spencers"="45"; "steamplant"="35"; "steelhead"="35";
|
||||
"stylus"="35"; "sweetlous"="35"; "swinglounge"="35"; "table13"="45";
|
||||
"tavolata"="45"; "terraza"="35"; "thaibamboo"="25"; "thedambar"="45";
|
||||
"titos"="35"; "tomatostreet"="35"; "tonysonthelake"="45"; "torratea"="45";
|
||||
"truelegends"="25"; "twigs"="35"; "uprise"="25"; "vaqueros"="35";
|
||||
"vicinopizza"="25"; "victoryburger"="25"; "vieuxcarre"="35"; "vineolive"="45";
|
||||
"wileys"="45"
|
||||
}
|
||||
|
||||
function Decode-Html($str) {
    # Decodes HTML character references in $str, collapses each run of
    # whitespace to a single space, and trims the result.
    # Returns $str untouched when it is $null or an empty string.
    if (-not $str) { return $str }

    # One-pass decode of all named and numeric references. Decoding '&amp;'
    # before the other entities (as a chain of -replace calls does) would turn
    # the text '&amp;lt;' into '<' instead of '&lt;'.
    $s = [System.Net.WebUtility]::HtmlDecode([string]$str)

    # Fold the typographic apostrophe (U+2019) to ASCII.
    # NOTE(review): assumes the JSON should carry ASCII apostrophes - confirm.
    $s = $s -replace '\u2019', "'"

    # \s also matches the no-break space that '&nbsp;' decodes to.
    $s = $s -replace '\s+', ' '
    $s.Trim()
}
|
||||
|
||||
# Counters reported after the loop.
$priceFixed = 0
$entitiesFixed = 0

foreach ($entry in $data) {
    # Price: the authoritative map wins whenever it knows this slug.
    if ($authPrices.ContainsKey($entry.slug)) {
        $listed = [int]$authPrices[$entry.slug]
        if ($entry.price -ne $listed) {
            Write-Host "Price fix: $($entry.slug) $($entry.price) -> $listed"
            $entry.price = $listed
            $priceFixed += 1
        }
    }

    # Text fields: decode entities in place. Only a changed restaurant name
    # is counted in $entitiesFixed.
    $before = $entry.name
    $entry.name = Decode-Html $entry.name
    $entry.cuisine = Decode-Html $entry.cuisine
    $entry.menu.hours = Decode-Html $entry.menu.hours
    if ($before -ne $entry.name) { $entitiesFixed += 1 }

    foreach ($label in @('First Course', 'Second Course', 'Third Course')) {
        $dishes = $entry.menu.courses.$label
        # Skip missing or empty course lists.
        if (-not $dishes) { continue }

        foreach ($dish in $dishes) {
            $dish.name = Decode-Html $dish.name
            $dish.desc = Decode-Html $dish.desc
        }
    }
}
|
||||
|
||||
Write-Host ""
Write-Host "Fixed prices: $priceFixed"
Write-Host "Fixed names with entities: $entitiesFixed"
Write-Host ""

# ---- Report remaining issues ----
# Two lists: restaurants whose three course lists are all empty, and
# restaurants with at least one course list that does not hold exactly three
# dishes (all-empty ones are not repeated in the second list).
Write-Host "=== Remaining Issues ==="
Write-Host ""

Write-Host "Zero-course restaurants (all 3 empty):"
$data | Where-Object {
    $_.menu.courses.'First Course'.Count -eq 0 -and
    $_.menu.courses.'Second Course'.Count -eq 0 -and
    $_.menu.courses.'Third Course'.Count -eq 0
} | ForEach-Object { Write-Host " $($_.slug) [$($_.price)] $($_.name)" }

Write-Host ""
Write-Host "Partial courses (any course count not 3):"
$data | Where-Object {
    ($_.menu.courses.'First Course'.Count -ne 3 -or
     $_.menu.courses.'Second Course'.Count -ne 3 -or
     $_.menu.courses.'Third Course'.Count -ne 3) -and
    -not (
        $_.menu.courses.'First Course'.Count -eq 0 -and
        $_.menu.courses.'Second Course'.Count -eq 0 -and
        $_.menu.courses.'Third Course'.Count -eq 0
    )
} | ForEach-Object {
    $c1 = $_.menu.courses.'First Course'.Count
    $c2 = $_.menu.courses.'Second Course'.Count
    $c3 = $_.menu.courses.'Third Course'.Count
    Write-Host " $($_.slug) [$($_.price)]: $c1/$c2/$c3 - $($_.name)"
}

# ---- Save ----
$json = $data | ConvertTo-Json -Depth 10

# [System.Text.Encoding]::UTF8 emits a byte order mark; JSON files should not
# start with one (strict parsers reject it), so write UTF-8 without a BOM.
$utf8NoBom = New-Object System.Text.UTF8Encoding($false)
[System.IO.File]::WriteAllText($jsonPath, $json, $utf8NoBom)

Write-Host ""
Write-Host "Saved to $jsonPath"
|
||||
179
rescrape-missing.ps1
Normal file
179
rescrape-missing.ps1
Normal file
@@ -0,0 +1,179 @@
|
||||
# rescrape-missing.ps1 - Re-fetches 0-course and partial restaurants
|
||||
# using CDX API to find best available Wayback Machine snapshot
|
||||
|
||||
$projectDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||
$jsonPath = Join-Path $projectDir '2025-restaurants.json'
|
||||
|
||||
$data = Get-Content $jsonPath -Raw -Encoding UTF8 | ConvertFrom-Json
|
||||
|
||||
function Decode-Html($str) {
    # Decodes HTML character references in $str, collapses each run of
    # whitespace to a single space, and trims the result.
    # Returns $str untouched when it is $null or an empty string.
    if (-not $str) { return $str }

    # Single-pass decode: unlike a chain of -replace calls that starts with
    # '&amp;', this cannot decode the same text twice ('&amp;lt;' stays '&lt;').
    $s = [System.Net.WebUtility]::HtmlDecode([string]$str)

    # Fold the typographic apostrophe (U+2019) to ASCII.
    # NOTE(review): assumes the JSON should carry ASCII apostrophes - confirm.
    $s = $s -replace '\u2019', "'"

    # \s also matches the no-break space that '&nbsp;' decodes to.
    $s = $s -replace '\s+', ' '
    $s.Trim()
}
|
||||
|
||||
function Get-CleanText($rawHtml) {
    # Turns an HTML fragment into plain text: every tag is replaced by a space
    # (so words on either side of a tag stay separated), then Decode-Html
    # handles entities and whitespace, and the result is trimmed.
    $withoutTags = $rawHtml -replace '<[^>]+>', ' '
    $plain = Decode-Html $withoutTags
    return $plain.Trim()
}
|
||||
|
||||
function Invoke-Dishes($courseHtml) {
    # Extracts the dishes from one course's HTML fragment.
    #
    # Every <p> that contains a <strong> is treated as one dish: the first
    # <strong> is the dish name, and the text after the first <br> (or, when
    # there is no <br>, after the closing </strong>) is the description.
    #
    # Returns an ArrayList of objects with 'name' and 'desc' properties. The
    # list is returned wrapped (",$dishes") so that an empty or single-item
    # list is not unrolled by the pipeline.
    $dishes = [System.Collections.ArrayList]@()
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # <strong> text that consists ONLY of dietary tags, e.g. "GFA", "GF / V+".
    # The whole name must match: -match is case-insensitive, so a bare prefix
    # test such as '^(GF|V\+?|DF)' would also throw away real dishes like
    # "Vanilla Cheesecake" or "Veal Marsala".
    $tagsOnly = '^(?:(?:GFA?|DF|V\+?)(?:\s*[,/&]\s*|\s+|$))+$'

    foreach ($pm in [regex]::Matches($courseHtml, '<p[^>]*>(.*?)</p>', $opts)) {
        $pContent = $pm.Groups[1].Value

        $nameM = [regex]::Match($pContent, '<strong>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }
        $name = Get-CleanText $nameM.Groups[1].Value

        # Skip non-dish paragraphs: dietary tag lists, lines starting with
        # "2025" or "Drink", legend lines such as "GF:", and implausible lengths.
        if ($name -match $tagsOnly) { continue }
        if ($name -match '^(2025|Drink)') { continue }
        if ($name.Length -lt 3 -or $name.Length -gt 80) { continue }
        if ($name -match '^[A-Z]{1,3}:') { continue }

        $afterBr = ''
        if ($pContent -match '(?s)<br\s*/?>(.*?)$') {
            $afterBr = $matches[1]
        }
        else {
            $afterStrong = [regex]::Match($pContent, '(?s)</strong>(.*?)$', $opts)
            if ($afterStrong.Success) { $afterBr = $afterStrong.Groups[1].Value }
        }

        $desc = Get-CleanText $afterBr
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }

    return ,$dishes
}
|
||||
|
||||
function Invoke-CourseBlock($html, $courseLabel, $nextLabel) {
    # Returns the raw HTML that belongs to one course, or '' when none is found.
    #
    #   $html        - full page HTML
    #   $courseLabel - heading text of the wanted course
    #   $nextLabel   - heading text of the following course, or $null
    $singleline = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # First attempt: everything between this label and the next label.
    if ($nextLabel) {
        $between = '{0}(.+?)(?={1})' -f [regex]::Escape($courseLabel), [regex]::Escape($nextLabel)
        $hit = [regex]::Match($html, $between, $singleline)
        if ($hit.Success) { return $hit.Groups[1].Value }
    }

    # Second attempt: the first non-heading et_pb_text_inner block found in the
    # 6000 characters that start at the label.
    $start = $html.IndexOf($courseLabel)
    if ($start -lt 0) { return '' }

    $window = $html.Substring($start, [Math]::Min(6000, $html.Length - $start))
    $inner = [regex]::Match($window, '(?s)et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)', $singleline)
    if ($inner.Success) { return $inner.Groups[1].Value }

    return ''
}
|
||||
|
||||
function Parse-RestaurantHtml($html) {
    # Parses one restaurant page and returns a hashtable with the keys
    # first / second / third (dish lists from Invoke-Dishes), hours and phone
    # (strings, '' when the page has no match).
    $firstBlock  = Invoke-CourseBlock $html 'First Course' 'Second Course'
    $secondBlock = Invoke-CourseBlock $html 'Second Course' 'Third Course'
    $thirdBlock  = Invoke-CourseBlock $html 'Third Course' $null

    $firstDishes  = Invoke-Dishes $firstBlock
    $secondDishes = Invoke-Dishes $secondBlock
    $thirdDishes  = Invoke-Dishes $thirdBlock

    # "Menu served ..." up to the next tag.
    $hours = ''
    if ($html -match 'Menu served ([^<]+)') {
        $hours = "Menu served $($matches[1].Trim())"
    }

    # First phone number written as (208) or (509) followed by NNN-NNNN.
    $phone = ''
    if ($html -match '\((?:208|509)\) \d{3}-\d{4}') {
        $phone = $matches[0]
    }

    return @{
        first  = $firstDishes
        second = $secondDishes
        third  = $thirdDishes
        hours  = $hours
        phone  = $phone
    }
}
|
||||
|
||||
# Find which restaurants need re-scraping: every restaurant with at least one
# course list that does not hold exactly three dishes (this includes the
# restaurants with no dishes at all).
# @() forces an array so .Count is correct for zero or one match as well; a
# single bare [pscustomobject] has no usable .Count in Windows PowerShell.
$needsRescrape = @($data | Where-Object {
    $_ -and (
        $_.menu.courses.'First Course'.Count -ne 3 -or
        $_.menu.courses.'Second Course'.Count -ne 3 -or
        $_.menu.courses.'Third Course'.Count -ne 3
    )
})

Write-Host "Restaurants to re-scrape: $($needsRescrape.Count)"
Write-Host ""

foreach ($r in $needsRescrape) {
    $slug = $r.slug
    Write-Host "[$slug] Looking up CDX snapshots..." -NoNewline

    try {
        # CDX API: find snapshots from Jan-May 2025
        $cdxUrl = "https://web.archive.org/cdx/search/cdx?url=inlanderrestaurantweek.com/project/$slug/&output=text&limit=10&from=20250101&to=20250501&filter=statuscode:200&fl=timestamp"
        $cdxResp = Invoke-WebRequest -Uri $cdxUrl -UseBasicParsing -TimeoutSec 30 -ErrorAction Stop
        $timestamps = @(
            $cdxResp.Content -split "`n" |
                ForEach-Object { $_.Trim() } |
                Where-Object { $_ -match '^\d{14}$' }
        )

        if ($timestamps.Count -eq 0) {
            Write-Host " No CDX snapshots found"
        }
        else {
            Write-Host " Found $($timestamps.Count) snapshots"

            # What the JSON currently holds; a snapshot has to beat this.
            $curr1 = $r.menu.courses.'First Course'.Count
            $curr2 = $r.menu.courses.'Second Course'.Count
            $curr3 = $r.menu.courses.'Third Course'.Count

            $best = $null
            # Best total seen so far. Tracking it stops a later, poorer
            # snapshot from replacing a better one found earlier.
            $bestTotal = $curr1 + $curr2 + $curr3

            foreach ($ts in $timestamps) {
                Write-Host " Trying $ts..." -NoNewline
                try {
                    $pageUrl = "https://web.archive.org/web/$ts/https://inlanderrestaurantweek.com/project/$slug/"
                    $resp = Invoke-WebRequest -Uri $pageUrl -UseBasicParsing -TimeoutSec 45 -ErrorAction Stop
                    $html = $resp.Content

                    $parsed = Parse-RestaurantHtml $html
                    $c1 = $parsed.first.Count
                    $c2 = $parsed.second.Count
                    $c3 = $parsed.third.Count
                    Write-Host " $c1/$c2/$c3"

                    # A snapshot with at least three dishes per course is good
                    # enough: take it and stop looking.
                    if ($c1 -ge 3 -and $c2 -ge 3 -and $c3 -ge 3) {
                        $best = $parsed
                        break
                    }

                    $newTotal = $c1 + $c2 + $c3
                    if ($newTotal -gt $bestTotal) {
                        $best = $parsed
                        $bestTotal = $newTotal
                    }
                }
                catch {
                    Write-Host " FETCH ERROR"
                }
                Start-Sleep -Milliseconds 400
            }

            # Only overwrite when the chosen snapshot has more dishes in at
            # least one course than the JSON currently has.
            $improved = $best -and (
                $best.first.Count -gt $curr1 -or
                $best.second.Count -gt $curr2 -or
                $best.third.Count -gt $curr3
            )

            if ($improved) {
                Write-Host " -> Updating with $($best.first.Count)/$($best.second.Count)/$($best.third.Count) courses"
                $r.menu.courses.'First Course' = @($best.first)
                $r.menu.courses.'Second Course' = @($best.second)
                $r.menu.courses.'Third Course' = @($best.third)
                # Fill hours / phone only when the JSON has none yet.
                if ($best.hours -and -not $r.menu.hours) { $r.menu.hours = $best.hours }
                if ($best.phone -and -not $r.menu.phone) { $r.menu.phone = $best.phone }
            }
            else {
                Write-Host " -> No improvement found"
            }
        }
    }
    catch {
        Write-Host " CDX ERROR: $_"
    }

    # Runs for every restaurant, including those without snapshots, so the
    # CDX lookups stay throttled.
    Start-Sleep -Milliseconds 500
}
|
||||
|
||||
# ---- Final report ----
# Lists every restaurant that still has a course list without exactly three
# dishes (restaurants with no dishes at all are included), then saves the JSON.
Write-Host ""
Write-Host "=== Final Status ==="
$data | ForEach-Object {
    $c1 = $_.menu.courses.'First Course'.Count
    $c2 = $_.menu.courses.'Second Course'.Count
    $c3 = $_.menu.courses.'Third Course'.Count
    if ($c1 -ne 3 -or $c2 -ne 3 -or $c3 -ne 3) {
        Write-Host " $($_.slug): $c1/$c2/$c3"
    }
}

$json = $data | ConvertTo-Json -Depth 10

# [System.Text.Encoding]::UTF8 emits a byte order mark; JSON files should not
# start with one (strict parsers reject it), so write UTF-8 without a BOM.
$utf8NoBom = New-Object System.Text.UTF8Encoding($false)
[System.IO.File]::WriteAllText($jsonPath, $json, $utf8NoBom)

Write-Host ""
Write-Host "Saved to $jsonPath"
|
||||
166
rescrape2-missing.ps1
Normal file
166
rescrape2-missing.ps1
Normal file
@@ -0,0 +1,166 @@
|
||||
# rescrape2-missing.ps1 - Re-fetches problematic restaurants with multiple timestamps
|
||||
# Uses fixed timestamps (no CDX API) with generous delays to avoid rate limiting
|
||||
|
||||
$projectDir = Split-Path -Parent $MyInvocation.MyCommand.Definition
|
||||
$jsonPath = Join-Path $projectDir '2025-restaurants.json'
|
||||
|
||||
$data = Get-Content $jsonPath -Raw -Encoding UTF8 | ConvertFrom-Json
|
||||
|
||||
function Decode-Html($str) {
    # Decodes HTML character references in $str, collapses each run of
    # whitespace to a single space, and trims the result.
    # Returns $str untouched when it is $null or an empty string.
    if (-not $str) { return $str }

    # Single-pass decode: unlike a chain of -replace calls that starts with
    # '&amp;', this cannot decode the same text twice ('&amp;lt;' stays '&lt;').
    $decoded = [System.Net.WebUtility]::HtmlDecode([string]$str)

    # Fold the typographic apostrophe (U+2019) to ASCII.
    # NOTE(review): assumes the JSON should carry ASCII apostrophes - confirm.
    $decoded = $decoded -replace '\u2019', "'"

    # \s also matches the no-break space that '&nbsp;' decodes to.
    ($decoded -replace '\s+', ' ').Trim()
}
|
||||
|
||||
function Get-CleanText($rawHtml) {
    # Plain text of an HTML fragment: each tag becomes a space, then
    # Decode-Html takes care of entities and whitespace.
    $withoutTags = $rawHtml -replace '<[^>]+>', ' '
    Decode-Html $withoutTags
}
|
||||
|
||||
function Invoke-Dishes($courseHtml) {
    # Extracts the dishes from one course's HTML fragment.
    #
    # Every <p> that contains a <strong> is treated as one dish: the first
    # <strong> is the dish name, and the text after the first <br> (or, when
    # there is no <br>, after the closing </strong>) is the description.
    #
    # Returns an ArrayList of objects with 'name' and 'desc' properties. The
    # list is returned wrapped (",$dishes") so that an empty or single-item
    # list is not unrolled by the pipeline.
    $dishes = [System.Collections.ArrayList]@()
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # <strong> text that consists ONLY of dietary tags, e.g. "GFA", "GF / V+".
    # The whole name must match: -match is case-insensitive, so a bare prefix
    # test such as '^(GF|V\+?|DF)' would also throw away real dishes like
    # "Vanilla Cheesecake" or "Veal Marsala".
    $tagsOnly = '^(?:(?:GFA?|DF|V\+?)(?:\s*[,/&]\s*|\s+|$))+$'

    foreach ($pm in [regex]::Matches($courseHtml, '<p[^>]*>(.*?)</p>', $opts)) {
        $pContent = $pm.Groups[1].Value

        $nameM = [regex]::Match($pContent, '<strong>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }
        $name = Get-CleanText $nameM.Groups[1].Value

        # Skip non-dish paragraphs: dietary tag lists, lines starting with
        # "2025" or "Drink", legend lines such as "GF:", and implausible lengths.
        if ($name -match $tagsOnly) { continue }
        if ($name -match '^(2025|Drink)') { continue }
        if ($name.Length -lt 3 -or $name.Length -gt 80) { continue }
        if ($name -match '^[A-Z]{1,3}:') { continue }

        $afterBr = ''
        if ($pContent -match '(?s)<br\s*/?>(.*?)$') {
            $afterBr = $matches[1]
        }
        else {
            $am = [regex]::Match($pContent, '(?s)</strong>(.*?)$', $opts)
            if ($am.Success) { $afterBr = $am.Groups[1].Value }
        }

        $desc = Get-CleanText $afterBr
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }

    return ,$dishes
}
|
||||
|
||||
function Invoke-CourseBlock($html, $courseLabel, $nextLabel) {
    # Returns the raw HTML that belongs to one course, or '' when none is found.
    #
    #   $html        - full page HTML
    #   $courseLabel - heading text of the wanted course
    #   $nextLabel   - heading text of the following course, or $null
    $singleline = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # First attempt: everything between this label and the next label.
    if ($nextLabel) {
        $fromLabel = [regex]::Escape($courseLabel)
        $toLabel = [regex]::Escape($nextLabel)
        $hit = [regex]::Match($html, ($fromLabel + '(.+?)(?=' + $toLabel + ')'), $singleline)
        if ($hit.Success) { return $hit.Groups[1].Value }
    }

    # Second attempt: the first non-heading et_pb_text_inner block found in the
    # 6000 characters that start at the label.
    $start = $html.IndexOf($courseLabel)
    if ($start -lt 0) { return '' }

    $window = $html.Substring($start, [Math]::Min(6000, $html.Length - $start))
    $inner = [regex]::Match($window, '(?s)et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)', $singleline)
    if ($inner.Success) { return $inner.Groups[1].Value }

    return ''
}
|
||||
|
||||
function Fetch-And-Parse($url) {
    # Downloads one archived restaurant page and parses it.
    #
    # Returns a hashtable with the keys first / second / third (dish lists from
    # Invoke-Dishes), hours and phone (strings, '' when not found) and total
    # (always 0 here).
    # Throws when the request fails or the body is a "429 Too Many Requests" page.
    $resp = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 45 -ErrorAction Stop

    # Decode the body bytes as UTF-8 ourselves instead of trusting .Content, so
    # accented characters in dish names are not mangled when the response is
    # decoded with a different default charset.
    $html = [System.Text.Encoding]::UTF8.GetString($resp.RawContentStream.ToArray())

    # Check if it's a 429 page
    if ($html -match '429 Too Many Requests') { throw "Rate limited" }

    $fc = Invoke-CourseBlock $html 'First Course' 'Second Course'
    $sc = Invoke-CourseBlock $html 'Second Course' 'Third Course'
    $tc = Invoke-CourseBlock $html 'Third Course' $null

    return @{
        first  = Invoke-Dishes $fc
        second = Invoke-Dishes $sc
        third  = Invoke-Dishes $tc
        hours  = if ($html -match 'Menu served ([^<]+)') { "Menu served $($matches[1].Trim())" } else { '' }
        phone  = if ($html -match '\((?:208|509)\) \d{3}-\d{4}') { $matches[0] } else { '' }
        total  = 0
    }
}
|
||||
|
||||
# Timestamps to try for each restaurant (spanning Feb-May 2025)
|
||||
$timestamps = @(
|
||||
'20250301000000',
|
||||
'20250308000000',
|
||||
'20250315000000',
|
||||
'20250401000000',
|
||||
'20250415000000',
|
||||
'20250501000000'
|
||||
)
|
||||
|
||||
# Find problematic restaurants: every restaurant with at least one course list
# that does not hold exactly three dishes (all-empty ones included).
# @() forces an array so .Count is correct for zero or one match as well; a
# single bare [pscustomobject] has no usable .Count in Windows PowerShell.
$problems = @($data | Where-Object {
    $_ -and (
        $_.menu.courses.'First Course'.Count -ne 3 -or
        $_.menu.courses.'Second Course'.Count -ne 3 -or
        $_.menu.courses.'Third Course'.Count -ne 3
    )
})

Write-Host "Restaurants to retry: $($problems.Count)"
Write-Host "Starting with 3-second delay between requests..."
Write-Host ""

$i = 0
foreach ($r in $problems) {
    $i++
    $slug = $r.slug
    $curr1 = $r.menu.courses.'First Course'.Count
    $curr2 = $r.menu.courses.'Second Course'.Count
    $curr3 = $r.menu.courses.'Third Course'.Count
    $currTotal = $curr1 + $curr2 + $curr3
    Write-Host "[$i/$($problems.Count)] $slug (currently $curr1/$curr2/$curr3)"

    # Best snapshot so far; it has to beat the dish total already in the JSON.
    $bestResult = $null
    $bestTotal = $currTotal

    foreach ($ts in $timestamps) {
        $url = "https://web.archive.org/web/$ts/https://inlanderrestaurantweek.com/project/$slug/"
        Write-Host " Trying $ts..." -NoNewline
        try {
            $result = Fetch-And-Parse $url
            $t = $result.first.Count + $result.second.Count + $result.third.Count
            Write-Host " $($result.first.Count)/$($result.second.Count)/$($result.third.Count)"

            if ($t -gt $bestTotal) {
                $bestTotal = $t
                $bestResult = $result
                if ($result.first.Count -ge 3 -and $result.second.Count -ge 3 -and $result.third.Count -ge 3) {
                    break  # Perfect - no need to try more timestamps
                }
            }
        }
        catch {
            Write-Host " FAIL: $_"
        }
        Start-Sleep -Milliseconds 3000
    }

    if ($bestResult -and $bestTotal -gt $currTotal) {
        Write-Host " -> Updating: $($bestResult.first.Count)/$($bestResult.second.Count)/$($bestResult.third.Count)"
        $r.menu.courses.'First Course' = @($bestResult.first)
        $r.menu.courses.'Second Course' = @($bestResult.second)
        $r.menu.courses.'Third Course' = @($bestResult.third)
        # Fill hours / phone only when the JSON has none yet.
        if ($bestResult.hours -and -not $r.menu.hours) { $r.menu.hours = $bestResult.hours }
        if ($bestResult.phone -and -not $r.menu.phone) { $r.menu.phone = $bestResult.phone }
    }
    else {
        Write-Host " -> No improvement"
    }

    Start-Sleep -Milliseconds 2000
}
|
||||
|
||||
Write-Host ""
Write-Host "=== Final Status ==="

# Restaurants that still have a course list without exactly three dishes.
# @() forces an array so .Count is correct for zero or one match as well; a
# single bare [pscustomobject] has no usable .Count in Windows PowerShell.
$remaining = @($data | Where-Object {
    $_ -and (
        $_.menu.courses.'First Course'.Count -ne 3 -or
        $_.menu.courses.'Second Course'.Count -ne 3 -or
        $_.menu.courses.'Third Course'.Count -ne 3
    )
})

Write-Host "Still incomplete: $($remaining.Count)"
foreach ($r in $remaining) {
    # Read the counts from the loop variable. $_ is not set by a foreach
    # statement, so the earlier reads through $_ were dead code.
    $c1 = $r.menu.courses.'First Course'.Count
    $c2 = $r.menu.courses.'Second Course'.Count
    $c3 = $r.menu.courses.'Third Course'.Count
    Write-Host " $($r.slug): $c1/$c2/$c3"
}

$json = $data | ConvertTo-Json -Depth 10

# [System.Text.Encoding]::UTF8 emits a byte order mark; JSON files should not
# start with one (strict parsers reject it), so write UTF-8 without a BOM.
$utf8NoBom = New-Object System.Text.UTF8Encoding($false)
[System.IO.File]::WriteAllText($jsonPath, $json, $utf8NoBom)

Write-Host ""
Write-Host "Saved to $jsonPath"
|
||||
227
scrape-2025.ps1
Normal file
227
scrape-2025.ps1
Normal file
@@ -0,0 +1,227 @@
|
||||
# scrape-2025.ps1 - Scrapes 2025 Inlander Restaurant Week menus from Wayback Machine
|
||||
# Run from local path (UNC paths block PS execution)
|
||||
|
||||
$slugs = @(
|
||||
"1898", "24taps", "315cuisine", "ambrosia", "anthonys", "arrowhead", "baba",
|
||||
"backyardpublichouse", "bangkokthai", "bardenay", "barkrescuepub", "beverlys",
|
||||
"blackpearl", "borracho", "burgerdock", "cascadia", "cedars", "centennial",
|
||||
"chaps", "chinook", "chowderhead", "clinkerdagger", "cochinito", "collectivekitchen",
|
||||
"dassteinhaus", "deleons", "deleonstexmex", "dockside", "downriver", "dryfly",
|
||||
"durkins", "east", "emrys", "feastworldkitchen", "flameandcork", "flatstick",
|
||||
"flyinggoat", "fortheloveofgod", "francaise", "ganderryegrass", "gardenparty",
|
||||
"gildedunicorn", "hang10", "heritage", "hogwash", "honey", "hulapot",
|
||||
"indiahouse", "indicana", "inlandpacifickitchen", "irongoat", "ironwoodice",
|
||||
"karma", "kasa", "kismet", "kunisthai", "latahbistro", "lebanon", "legendsoffire",
|
||||
"littledragon", "littlenoodle", "longhornbbq", "loren", "lumberbeard",
|
||||
"macdaddys", "mackenzieriver", "mammamias", "mangotree", "maryhill",
|
||||
"masselowslounge", "max", "meltingpot", "mortys", "northhill", "odohertys",
|
||||
"osprey", "outsider", "palmcourtgrill", "ponderosa", "purenorthwest",
|
||||
"purgatory", "qqsushi", "redtail", "republickitchen", "republicpi", "rut",
|
||||
"safariroom", "saranac", "satay", "sauced", "screamingyak", "seasons",
|
||||
"shawnodonnells", "shelbys", "skewers", "southhillgrill", "southperrylantern",
|
||||
"spencers", "steamplant", "steelhead", "stylus", "sweetlous", "swinglounge",
|
||||
"table13", "tavolata", "terraza", "thaibamboo", "thedambar", "titos",
|
||||
"tomatostreet", "tonysonthelake", "torratea", "truelegends", "twigs",
|
||||
"uprise", "vaqueros", "vicinopizza", "victoryburger", "vieuxcarre",
|
||||
"vineolive", "wileys"
|
||||
)
|
||||
|
||||
$areaMap = [ordered]@{
|
||||
"AIRWAY HEIGHTS" = "Airway Heights"
|
||||
"ATHOL" = "Athol"
|
||||
"COEUR D'ALENE" = "Coeur d'Alene"
|
||||
"POST FALLS" = "Post Falls"
|
||||
"HAYDEN" = "Hayden"
|
||||
"LIBERTY LAKE" = "Liberty Lake"
|
||||
"NORTH SPOKANE" = "North Spokane"
|
||||
"SOUTH SPOKANE" = "South Spokane"
|
||||
"SPOKANE VALLEY" = "Spokane Valley"
|
||||
"WEST SPOKANE" = "West Spokane"
|
||||
"WORLEY" = "Worley"
|
||||
"DOWNTOWN" = "Downtown"
|
||||
}
|
||||
|
||||
function Get-CleanText($rawHtml) {
    # Turns an HTML fragment into plain, single-spaced, trimmed text.
    # Tags are replaced by a space (so words on either side stay separated),
    # character references are decoded, and en/em dashes become '-'.
    $t = $rawHtml -replace '<[^>]+>', ' '

    # Decode all named and numeric references in one pass. This runs after the
    # tag strip, so a decoded '<' is never mistaken for markup, and nothing is
    # decoded twice ('&amp;lt;' stays '&lt;').
    $t = [System.Net.WebUtility]::HtmlDecode($t)

    # Fold the typographic apostrophe (U+2019) to ASCII.
    # NOTE(review): assumes the JSON should carry ASCII apostrophes - confirm.
    $t = $t -replace '\u2019', "'"

    # En dash (U+2013) and em dash (U+2014) are stored as a plain hyphen.
    $t = $t -replace '[\u2013\u2014]', '-'

    # \s also matches the no-break space that '&nbsp;' decodes to.
    $t = $t -replace '\s+', ' '
    $t.Trim()
}
|
||||
|
||||
function Extract-Dishes($courseHtml) {
    # Collects the dishes found in one course's HTML fragment.
    # Each <p> holding a <strong> is one candidate: the first <strong> gives the
    # name, the text after the first <br> (else after </strong>) the description.
    # Returns an ArrayList of objects with 'name' and 'desc', wrapped so the
    # pipeline does not unroll it.
    $found = [System.Collections.ArrayList]@()
    $singleline = [System.Text.RegularExpressions.RegexOptions]::Singleline

    foreach ($paragraph in [regex]::Matches($courseHtml, '<p[^>]*>(.*?)</p>', $singleline)) {
        $inner = $paragraph.Groups[1].Value
        if ($inner -notmatch '<strong>') { continue }

        # First <strong> = dish name
        $strong = [regex]::Match($inner, '<strong>(.*?)</strong>', $singleline)
        if (-not $strong.Success) { continue }
        $title = Get-CleanText $strong.Groups[1].Value

        # Not a dish: a bare dietary tag or "2025", a legend line like "GF:",
        # or a name that is too short / too long.
        $isNoise = ($title -match '^(GF|GFA|V\+?|DF|V:|2025)$') -or
                   ($title.Length -lt 3) -or
                   ($title -match '^[A-Z]{1,3}:') -or
                   ($title.Length -gt 80)
        if ($isNoise) { continue }

        # Description: everything after first <br/>, else after </strong>
        $tail = ''
        if ($inner -match '(?s)<br\s*/?>(.*?)$') {
            $tail = $matches[1]
        }
        else {
            $rest = [regex]::Match($inner, '(?s)</strong>(.*?)$', $singleline)
            if ($rest.Success) { $tail = $rest.Groups[1].Value }
        }

        $null = $found.Add([PSCustomObject]@{ name = $title; desc = (Get-CleanText $tail) })
    }

    return ,$found
}
|
||||
|
||||
# Extract-CourseBlock: return the raw HTML belonging to one course.
#   $html        - full page HTML.
#   $courseLabel - heading text of the wanted course (e.g. 'First Course').
#   $nextLabel   - heading text of the following course, or $null for the last.
#   Returns      - the HTML between the two labels, or (fallback) the first
#                  non-heading et_pb_text_inner block after the label, or ''
#                  when nothing matches.
function Extract-CourseBlock($html, $courseLabel, $nextLabel) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # Strategy 1: everything between this course's label and the next one.
    if ($nextLabel) {
        $spanPattern = '{0}(.+?)(?={1})' -f [regex]::Escape($courseLabel), [regex]::Escape($nextLabel)
        $spanHit = [regex]::Match($html, $spanPattern, $singleLine)
        if ($spanHit.Success) { return $spanHit.Groups[1].Value }
    }

    # Strategy 2: look at most 6000 characters past the label for the first
    # text_inner block that does not itself start with a heading.
    $labelAt = $html.IndexOf($courseLabel)
    if ($labelAt -lt 0) { return '' }

    $windowLength = [Math]::Min(6000, $html.Length - $labelAt)
    $window = $html.Substring($labelAt, $windowLength)
    $blockHit = [regex]::Match($window, '(?s)et_pb_text_inner">(?!<h[123])(.+?)(?=et_pb_text_inner"><h|</div>\s*</div>\s*</div>\s*</div>\s*<div)', $singleLine)
    if ($blockHit.Success) { return $blockHit.Groups[1].Value }

    return ''
}
|
||||
|
||||
# ---- Fetch and parse every restaurant page from the archived snapshot ----
# Reads $slugs and $areaMap (defined earlier in this script) and fills
# $restaurants with one record per slug. A slug that cannot be fetched or
# parsed still gets a placeholder record (hours = 'FETCH_ERROR') so it can be
# found and re-fetched later.
$restaurants = [System.Collections.ArrayList]@()
$total = $slugs.Count
$i = 0

# The archive throttles bursts of requests, so a failed fetch is retried with
# a growing pause instead of being recorded as an error straight away.
$maxAttempts = 3

foreach ($slug in $slugs) {
    $i++
    Write-Host "[$i/$total] Fetching: $slug" -NoNewline

    $url = "https://web.archive.org/web/20250306132630/https://inlanderrestaurantweek.com/project/$slug/"

    # --- Fetch (with retry) ---
    $html = $null
    $failure = $null
    for ($attempt = 1; $attempt -le $maxAttempts; $attempt++) {
        try {
            $response = Invoke-WebRequest -Uri $url -UseBasicParsing -TimeoutSec 60 -ErrorAction Stop
            # Read content as bytes then decode as UTF-8 to preserve special chars
            $bytes = $response.RawContentStream.ToArray()
            $html = [System.Text.Encoding]::UTF8.GetString($bytes)
            $failure = $null
            break
        } catch {
            $failure = $_

            # Pull the HTTP status if the exception carries a response;
            # looked up via PSObject so it is safe under strict mode too.
            $status = $null
            $respProp = $_.Exception.PSObject.Properties['Response']
            if ($respProp -and $respProp.Value) { $status = [int]$respProp.Value.StatusCode }

            # A 404 will not fix itself; give up immediately. Also stop after
            # the final attempt.
            if ($status -eq 404 -or $attempt -eq $maxAttempts) { break }

            Write-Host " (attempt $attempt failed; retrying)" -NoNewline
            Start-Sleep -Seconds (5 * $attempt)
        }
    }

    # --- Parse ---
    $record = $null
    if ($null -ne $html) {
        try {
            # --- Name --- (decoded so entities from <title> do not leak into the data)
            $nameM = [regex]::Match($html, '<title>(.+?) \| Inlander')
            $name = if ($nameM.Success) {
                [System.Net.WebUtility]::HtmlDecode($nameM.Groups[1].Value).Trim()
            } else { $slug }

            # --- Price (from <strong>$45</strong> in an h1) ---
            $priceM = [regex]::Match($html, '<strong>\$(\d+)</strong>')
            $price = if ($priceM.Success) { [int]$priceM.Groups[1].Value } else { 0 }

            # --- Cuisine ---
            $cuisineM = [regex]::Match($html, 'CUISINE:\s*([A-Z][A-Za-z/ ]+?)(?:\s*</|\s*<)')
            $cuisine = ''
            if ($cuisineM.Success) {
                $c = $cuisineM.Groups[1].Value.Trim()
                # ToTitleCase leaves all-caps words alone, hence the ToLower first.
                $cuisine = (Get-Culture).TextInfo.ToTitleCase($c.ToLower())
            }

            # --- Phone ---
            $phoneM = [regex]::Match($html, '\((?:208|509)\) \d{3}-\d{4}')
            $phone = if ($phoneM.Success) { $phoneM.Value } else { '' }

            # --- Area(s) ---
            # Invariant upper-casing keeps the comparison independent of the
            # machine's culture (e.g. Turkish dotted/dotless i).
            $areas = [System.Collections.ArrayList]@()
            $htmlUpper = $html.ToUpperInvariant()
            foreach ($aKey in $areaMap.Keys) {
                if ($htmlUpper.Contains($aKey)) {
                    $null = $areas.Add($areaMap[$aKey])
                }
            }
            $areas = @($areas | Select-Object -Unique)
            if ($areas.Count -eq 0) { $areas = @('Downtown') }

            # --- Hours ---
            $hoursM = [regex]::Match($html, 'Menu served [^<]+')
            $hours = if ($hoursM.Success) { $hoursM.Value.Trim() } else { '' }

            # --- Menu Courses ---
            $fc = Extract-CourseBlock $html 'First Course' 'Second Course'
            $sc = Extract-CourseBlock $html 'Second Course' 'Third Course'
            $tc = Extract-CourseBlock $html 'Third Course' $null

            $firstCourse = Extract-Dishes $fc
            $secondCourse = Extract-Dishes $sc
            $thirdCourse = Extract-Dishes $tc

            $fc1count = $firstCourse.Count
            $fc2count = $secondCourse.Count
            $fc3count = $thirdCourse.Count
            Write-Host " -> $name [$price] $fc1count/$fc2count/$fc3count courses"

            $record = [PSCustomObject]@{
                name    = $name
                slug    = $slug
                price   = $price
                areas   = $areas
                cuisine = $cuisine
                url     = "https://inlanderrestaurantweek.com/project/$slug/"
                menu    = [PSCustomObject]@{
                    hours   = $hours
                    phone   = $phone
                    courses = [PSCustomObject]@{
                        'First Course'  = @($firstCourse)
                        'Second Course' = @($secondCourse)
                        'Third Course'  = @($thirdCourse)
                    }
                }
            }
        } catch {
            $failure = $_
        }
    }

    # --- Placeholder for anything that could not be fetched or parsed ---
    if ($null -eq $record) {
        Write-Host " ERROR: $failure"
        $record = [PSCustomObject]@{
            name    = $slug
            slug    = $slug
            price   = 0
            areas   = @('Downtown')
            cuisine = ''
            url     = "https://inlanderrestaurantweek.com/project/$slug/"
            menu    = [PSCustomObject]@{
                hours   = 'FETCH_ERROR'
                phone   = ''
                courses = [PSCustomObject]@{
                    'First Course'  = @()
                    'Second Course' = @()
                    'Third Course'  = @()
                }
            }
        }
    }

    $null = $restaurants.Add($record)

    # Small pause between slugs to stay polite to the archive.
    Start-Sleep -Milliseconds 500
}
|
||||
|
||||
# ---- Write the scraped data as JSON into the current user's temp folder ----
# GetTempPath() resolves the running user's temp directory, so the script no
# longer depends on one specific user profile path.
$outPath = Join-Path ([System.IO.Path]::GetTempPath()) '2025-restaurants.json'

# -InputObject with an explicit array always serialises as a JSON array;
# piping a single-element list would emit a bare object instead.
$json = ConvertTo-Json -InputObject @($restaurants) -Depth 10

# JSON should not start with a byte-order mark; [Text.Encoding]::UTF8 writes one.
$utf8NoBom = New-Object System.Text.UTF8Encoding($false)
[System.IO.File]::WriteAllText($outPath, $json, $utf8NoBom)

Write-Host ""
Write-Host "Done! Saved $($restaurants.Count) restaurants to $outPath"
|
||||
119
test-parse.ps1
Normal file
119
test-parse.ps1
Normal file
@@ -0,0 +1,119 @@
|
||||
# Test parsing on local 315cuisine HTML
# Expects a saved copy of the restaurant page named 'test-restaurant.html' in
# the current user's temp folder (resolved at run time rather than hard-coded
# to one user profile).
$htmlPath = Join-Path ([System.IO.Path]::GetTempPath()) 'test-restaurant.html'
if (-not (Test-Path -LiteralPath $htmlPath)) {
    throw "Test fixture not found: $htmlPath"
}
$html = [System.IO.File]::ReadAllText($htmlPath, [System.Text.Encoding]::UTF8)
|
||||
|
||||
# Upper-case area heading (the form searched for in the upper-cased page
# text) -> display name. Every key is simply the upper-case of its display
# name, so the map is generated from the display names; insertion order is
# preserved by the ordered dictionary.
$areaMap = [ordered]@{}
foreach ($areaName in @(
        'Airway Heights',
        'Athol',
        "Coeur d'Alene",
        'Post Falls',
        'Hayden',
        'Liberty Lake',
        'North Spokane',
        'South Spokane',
        'Spokane Valley',
        'West Spokane',
        'Worley',
        'Downtown'
    )) {
    $areaMap[$areaName.ToUpperInvariant()] = $areaName
}
|
||||
|
||||
# Get-CleanText: turn an HTML fragment into single-line plain text.
#   $rawHtml - HTML fragment (may be $null or empty).
#   Returns  - the text with tags removed, entities decoded, en/em dashes
#              normalised to '-', whitespace collapsed and ends trimmed.
function Get-CleanText($rawHtml) {
    # Replace each tag with a space so words separated only by markup do not
    # fuse together.
    $t = $rawHtml -replace '<[^>]+>', ' '

    # Decode every named/numeric entity in one pass instead of listing a few
    # entities by hand (which misses the rest and can double-decode).
    $t = [System.Net.WebUtility]::HtmlDecode($t)

    # En dash (U+2013) and em dash (U+2014) become a plain hyphen.
    $t = $t -replace '[\u2013\u2014]', '-'

    # \s also matches the non-breaking space produced by decoding.
    $t = $t -replace '\s+', ' '
    $t.Trim()
}
|
||||
|
||||
# Extract-Dishes: parse the HTML of one course into dish objects.
#   $courseHtml - HTML fragment holding the course's <p> paragraphs.
#   Returns     - an ArrayList of [PSCustomObject]@{ name; desc }, emitted as
#                 a single object (unary comma) so the pipeline does not
#                 unroll an empty or one-item list.
function Extract-Dishes($courseHtml) {
    $dishes = [System.Collections.ArrayList]@()
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline

    $pMatches = [regex]::Matches($courseHtml, '<p[^>]*>(.*?)</p>', $opts)

    foreach ($pm in $pMatches) {
        $pContent = $pm.Groups[1].Value
        if ($pContent -notmatch '<strong>') { continue }

        # First <strong> = dish name
        $nameM = [regex]::Match($pContent, '<strong>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }
        $name = Get-CleanText $nameM.Groups[1].Value

        # Skip dietary-only names, the bare year, legend lines like "GF:" and
        # strings too short or too long to be a dish title.
        if ($name -match '^(GF|GFA|V\+?|DF|V:|2025)$') { continue }
        if ($name.Length -lt 3) { continue }
        if ($name -match '^[A-Z]{1,3}:') { continue }
        if ($name.Length -gt 80) { continue }

        # Description: everything after the first <br/>. (?s) lets '.' cross
        # line breaks; without it a description spanning several lines either
        # fails to match or latches onto a later <br/>.
        $afterBr = ''
        if ($pContent -match '(?s)<br\s*/?>(.*?)$') {
            $afterBr = $matches[1]
        } else {
            # No <br/>: fall back to whatever follows the first </strong>.
            $afterStrong = [regex]::Match($pContent, '</strong>(.*?)$', $opts)
            if ($afterStrong.Success) { $afterBr = $afterStrong.Groups[1].Value }
        }

        $desc = Get-CleanText $afterBr
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }

    return ,$dishes
}
|
||||
|
||||
# Extract-CourseBlock: return the raw HTML of the text block for one course.
#   $html        - full page HTML.
#   $courseLabel - heading text of the wanted course (e.g. 'First Course').
#   Returns      - the content of the et_pb_text_inner block that follows the
#                  label, or '' when no such block is found.
function Extract-CourseBlock($html, $courseLabel) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # Primary attempt: a text_inner block within 300 characters of the label,
    # read up to the next heading, dietary legend or later inner row.
    $primary = [regex]::Escape($courseLabel) + '.{0,300}?et_pb_text_inner">(.+?)(?=<h[123]|et_pb_text_inner"><h|V:\s*<|Vegetarian item|et_pb_row_inner_[23])'
    $primaryHit = [regex]::Match($html, $primary, $singleLine)
    if ($primaryHit.Success) { return $primaryHit.Groups[1].Value }

    # Fallback: first text_inner block inside the 4000 characters that start
    # at the label, read up to the next text_inner or a run of closing divs.
    $labelAt = $html.IndexOf($courseLabel)
    if ($labelAt -lt 0) { return '' }

    $windowLength = [Math]::Min(4000, $html.Length - $labelAt)
    $window = $html.Substring($labelAt, $windowLength)
    $fallbackHit = [regex]::Match($window, 'et_pb_text_inner">(.*?)(?=et_pb_text_inner|</div></div></div>)', $singleLine)
    if ($fallbackHit.Success) { return $fallbackHit.Groups[1].Value }

    return ''
}
|
||||
|
||||
# ---- Print the scalar fields parsed from the test page ----
# Each field uses the same regex as the scraper; a field that does not match
# simply prints as empty.
$nameM = [regex]::Match($html, '<title>(.+?) \| Inlander')
Write-Host "Name: $($nameM.Groups[1].Value.Trim())"

$priceM = [regex]::Match($html, '<strong>\$(\d+)</strong>')
Write-Host "Price: $($priceM.Groups[1].Value)"

$cuisineM = [regex]::Match($html, 'CUISINE:\s*([A-Z][A-Za-z /]+?)(?:\s*</|\s*<)')
# ToTitleCase leaves all-caps words alone, hence the ToLower first.
$cuisine = (Get-Culture).TextInfo.ToTitleCase($cuisineM.Groups[1].Value.Trim().ToLower())
Write-Host "Cuisine: $cuisine"

$phoneM = [regex]::Match($html, '\((?:208|509)\) \d{3}-\d{4}')
Write-Host "Phone: $($phoneM.Value)"

$hoursM = [regex]::Match($html, 'Menu served [^<]+')
Write-Host "Hours: $($hoursM.Value.Trim())"

# Upper-case the page once, outside the loop, and culture-invariantly so the
# match against the upper-case keys does not depend on the machine's locale.
$htmlUpper = $html.ToUpperInvariant()
$areas = @()
foreach ($aKey in $areaMap.Keys) {
    if ($htmlUpper.Contains($aKey)) { $areas += $areaMap[$aKey] }
}
Write-Host "Areas: $($areas -join ', ')"
|
||||
|
||||
# ---- Print the dishes parsed for each of the three courses ----
# One table row per course: the banner to print and the heading label that
# locates the course in the page.
$courseSections = @(
    @{ Banner = '--- FIRST COURSE ---';  Label = 'First Course' }
    @{ Banner = '--- SECOND COURSE ---'; Label = 'Second Course' }
    @{ Banner = '--- THIRD COURSE ---';  Label = 'Third Course' }
)

foreach ($section in $courseSections) {
    Write-Host ""
    Write-Host $section.Banner

    $blockHtml = Extract-CourseBlock $html $section.Label
    $parsedDishes = Extract-Dishes $blockHtml
    foreach ($dish in $parsedDishes) {
        Write-Host " [$($dish.name)] | $($dish.desc)"
    }
}
|
||||
Reference in New Issue
Block a user