Created the file for 2025 and started parsing the content from the archive website, but was rate limited. Will need to finish this in the future.
This commit is contained in:
119
test-parse.ps1
Normal file
119
test-parse.ps1
Normal file
@@ -0,0 +1,119 @@
|
||||
# Test parsing on local 315cuisine HTML
#
# Usage: .\test-parse.ps1 [-HtmlPath <saved-page.html>]
param(
    # Saved restaurant page to parse. The default is the location that was
    # previously hard-coded in this script, so existing invocations keep working.
    [string]$HtmlPath = 'C:\Users\DEREKC~1.CHN\AppData\Local\Temp\test-restaurant.html'
)

# Fail fast with a clear message. Without this check a missing file only raises
# a statement-terminating error, the script continues with $html = $null, and
# every later step reports an unrelated failure.
if (-not (Test-Path -LiteralPath $HtmlPath -PathType Leaf)) {
    throw "HTML file not found: $HtmlPath"
}

# .NET resolves relative paths against the process working directory, not the
# PowerShell location, so hand it a fully resolved path.
$resolvedHtmlPath = (Resolve-Path -LiteralPath $HtmlPath).ProviderPath
$html = [System.IO.File]::ReadAllText($resolvedHtmlPath, [System.Text.Encoding]::UTF8)

# Upper-case area label (searched for in the upper-cased page text) -> display
# name. Ordered so that the reported areas always come out in this sequence.
$areaMap = [ordered]@{
    "AIRWAY HEIGHTS" = "Airway Heights"
    "ATHOL"          = "Athol"
    "COEUR D'ALENE"  = "Coeur d'Alene"
    "POST FALLS"     = "Post Falls"
    "HAYDEN"         = "Hayden"
    "LIBERTY LAKE"   = "Liberty Lake"
    "NORTH SPOKANE"  = "North Spokane"
    "SOUTH SPOKANE"  = "South Spokane"
    "SPOKANE VALLEY" = "Spokane Valley"
    "WEST SPOKANE"   = "West Spokane"
    "WORLEY"         = "Worley"
    "DOWNTOWN"       = "Downtown"
}
|
||||
|
||||
function Get-CleanText($rawHtml) {
    <#
    .SYNOPSIS
        Converts an HTML fragment to trimmed, single-line plain text.
    .PARAMETER rawHtml
        HTML markup (may be empty or $null).
    .OUTPUTS
        [string] The text with tags removed, character entities decoded, and
        every run of whitespace collapsed to one space.
    #>

    # Tags are removed BEFORE entities are decoded, so escaped markup such as
    # "&lt;b&gt;" survives as the literal text "<b>" instead of being stripped.
    $text = $rawHtml -replace '<[^>]+>', ' '

    # Decode all character references in a single pass. Chained per-entity
    # replacements that handle "&amp;" first would double-decode input such as
    # "&amp;lt;", and they only cover a handful of entities.
    $text = [System.Net.WebUtility]::HtmlDecode($text)

    # "&nbsp;" decodes to U+00A0; fold it together with ordinary whitespace.
    $text = $text -replace '[\s\u00A0]+', ' '

    $text.Trim()
}
|
||||
|
||||
function Extract-Dishes($courseHtml) {
    <#
    .SYNOPSIS
        Extracts dishes (name + description) from the HTML of one course block.
    .DESCRIPTION
        Each <p>...</p> whose content contains a <strong> element is treated as
        one dish: the first <strong> holds the name, and the description is the
        text after the first <br> that follows the name (or, when there is no
        such <br>, everything after the closing </strong>).
        Paragraphs whose "name" looks like a dietary-legend marker, is shorter
        than 3 characters, or is longer than 80 characters are skipped.
    .PARAMETER courseHtml
        HTML of a single course section.
    .OUTPUTS
        [System.Collections.ArrayList] of objects with 'name' and 'desc'
        properties. Returned wrapped so an empty or one-item list is not
        unrolled by the pipeline.
    #>
    $dishes = [System.Collections.ArrayList]@()

    # Singleline lets '.' cross line breaks inside a paragraph; IgnoreCase keeps
    # these patterns consistent with the case-insensitive -match filters below.
    $opts = [System.Text.RegularExpressions.RegexOptions]'Singleline, IgnoreCase'

    # \b stops '<p' from also matching tags such as <pre>.
    foreach ($pm in [regex]::Matches($courseHtml, '<p\b[^>]*>(.*?)</p>', $opts)) {
        $pContent = $pm.Groups[1].Value

        # Accept <strong> with attributes as well; no <strong> means no dish.
        $nameM = [regex]::Match($pContent, '<strong\b[^>]*>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }

        $name = Get-CleanText $nameM.Groups[1].Value

        # Skip bolded legend markers and implausible names.
        # NOTE: -match is case-insensitive, so [A-Z] also matches lower case.
        if ($name -match '^(GF|GFA|V\+?|DF|V:)$') { continue }
        if ($name.Length -lt 3) { continue }
        if ($name -match '^[A-Z]{1,3}:') { continue }
        if ($name.Length -gt 80) { continue }

        # Only look for the description AFTER the name, so a <br> that precedes
        # <strong> cannot pull the name into the description.
        $tail = $pContent.Substring($nameM.Index + $nameM.Length)

        # Singleline is required here: without it a description that spans
        # several lines never matches and the code silently falls back to the
        # "everything after </strong>" branch, changing what is captured.
        $brM = [regex]::Match($tail, '<br\s*/?>(.*)', $opts)
        $descHtml = if ($brM.Success) { $brM.Groups[1].Value } else { $tail }

        $desc = Get-CleanText $descHtml
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }

    return ,$dishes
}
|
||||
|
||||
function Extract-CourseBlock($html, $courseLabel) {
    <#
    .SYNOPSIS
        Returns the inner HTML of the text module that follows a course heading.
    .DESCRIPTION
        First attempt: find the label, then the nearest 'et_pb_text_inner">'
        within 300 characters, and capture up to the next heading, legend
        marker, or inner row 2/3.
        Fallback: take at most 4000 characters starting at the label and
        capture the first 'et_pb_text_inner">' body up to the next text module
        or a triple closing </div>.
    .PARAMETER html
        Full page HTML.
    .PARAMETER courseLabel
        Literal heading text to look for, e.g. 'First Course' (case-sensitive).
    .OUTPUTS
        [string] The captured HTML, or '' when nothing could be located.
    #>
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # --- Primary strategy: label-anchored regex over the whole page ---------
    $primaryPattern = [regex]::Escape($courseLabel) + '.{0,300}?et_pb_text_inner">(.+?)(?=<h[123]|et_pb_text_inner"><h|V:\s*<|Vegetarian item|et_pb_row_inner_[23])'
    $primaryHit = [regex]::Match($html, $primaryPattern, $singleLine)
    if ($primaryHit.Success) {
        return $primaryHit.Groups[1].Value
    }

    # --- Fallback: bounded window starting at the label ---------------------
    $labelPos = $html.IndexOf($courseLabel)
    if ($labelPos -lt 0) {
        return ''
    }

    $windowLength = [Math]::Min(4000, $html.Length - $labelPos)
    $window = $html.Substring($labelPos, $windowLength)
    $fallbackHit = [regex]::Match($window, 'et_pb_text_inner">(.*?)(?=et_pb_text_inner|</div></div></div>)', $singleLine)
    if ($fallbackHit.Success) {
        return $fallbackHit.Groups[1].Value
    }

    return ''
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Header fields: each value is pulled from the page with one regex. A failed
# match yields an empty Value, so the label is printed with nothing after it.
# ---------------------------------------------------------------------------
$nameM = [regex]::Match($html, '<title>(.+?) \| Inlander')
Write-Host "Name: $($nameM.Groups[1].Value.Trim())"

$priceM = [regex]::Match($html, '<strong>\$(\d+)</strong>')
Write-Host "Price: $($priceM.Groups[1].Value)"

# The capture is lower-cased first because ToTitleCase leaves all-caps words
# untouched.
$cuisineM = [regex]::Match($html, 'CUISINE:\s*([A-Z][A-Za-z /]+?)(?:\s*</|\s*<)')
$cuisine = (Get-Culture).TextInfo.ToTitleCase($cuisineM.Groups[1].Value.Trim().ToLower())
Write-Host "Cuisine: $cuisine"

# Only the 208 and 509 area codes are recognized.
$phoneM = [regex]::Match($html, '\((?:208|509)\) \d{3}-\d{4}')
Write-Host "Phone: $($phoneM.Value)"

$hoursM = [regex]::Match($html, 'Menu served [^<]+')
Write-Host "Hours: $($hoursM.Value.Trim())"

# Upper-case the page once instead of once per area key, and collect the hits
# in a single expression rather than growing an array with += in a loop.
$htmlUpper = $html.ToUpper()
$areas = @(
    foreach ($aKey in $areaMap.Keys) {
        if ($htmlUpper.Contains($aKey)) { $areaMap[$aKey] }
    }
)
Write-Host "Areas: $($areas -join ', ')"

# Prints one course section (blank line, banner, one line per dish) and
# returns the parsed dishes so the caller can keep them.
function Show-Course($pageHtml, $courseLabel) {
    Write-Host ""
    Write-Host "--- $($courseLabel.ToUpperInvariant()) ---"

    $courseHtml = Extract-CourseBlock $pageHtml $courseLabel
    $dishes = Extract-Dishes $courseHtml
    foreach ($d in $dishes) { Write-Host " [$($d.name)] | $($d.desc)" }

    # Wrapped so the list is handed back as one object, not unrolled.
    return ,$dishes
}

$firstCourse  = Show-Course $html 'First Course'
$secondCourse = Show-Course $html 'Second Course'
$thirdCourse  = Show-Course $html 'Third Course'
|
||||
Reference in New Issue
Block a user