# Test parsing on local 315cuisine HTML
# Load the saved restaurant page as UTF-8 text.
# A failing .NET method call is only a statement-terminating error in PowerShell,
# so without this check the script would carry on with $html = $null and print
# misleading empty results. Fail fast with a clear message instead.
$htmlPath = 'C:\Users\DEREKC~1.CHN\AppData\Local\Temp\test-restaurant.html'
if (-not [System.IO.File]::Exists($htmlPath)) {
    throw "Test HTML file not found: $htmlPath"
}
$html = [System.IO.File]::ReadAllText($htmlPath, [System.Text.Encoding]::UTF8)
# Ordered lookup of service areas: key = upper-case label, value = display name.
# Insertion order is kept because it determines the order areas are reported in.
$areaMap = [ordered]@{}
@(
    'Airway Heights',
    'Athol',
    "Coeur d'Alene",
    'Post Falls',
    'Hayden',
    'Liberty Lake',
    'North Spokane',
    'South Spokane',
    'Spokane Valley',
    'West Spokane',
    'Worley',
    'Downtown'
) | ForEach-Object { $areaMap[$_.ToUpperInvariant()] = $_ }
# Convert an HTML fragment to plain text.
#   $rawHtml - HTML markup; may be empty or $null.
# Returns the text with tags removed, character entities decoded, runs of
# whitespace collapsed to a single space, and leading/trailing space trimmed.
function Get-CleanText($rawHtml) {
    if ([string]::IsNullOrEmpty($rawHtml)) { return '' }

    # Replace each tag with a space so words separated only by markup
    # (e.g. "one<br>two") do not fuse together.
    $text = $rawHtml -replace '<[^>]+>', ' '

    # Decode entities only after tags are gone, so escaped markup such as
    # "&lt;b&gt;" survives as literal text. HtmlDecode makes a single pass,
    # so "&amp;lt;" becomes "&lt;" rather than being decoded twice.
    $text = [System.Net.WebUtility]::HtmlDecode($text)

    # \s also matches the no-break space (U+00A0) that &nbsp; decodes to.
    $text = $text -replace '\s+', ' '
    $text.Trim()
}
# Extract the dishes listed in one course's HTML block.
#   $courseHtml - HTML fragment for a single course; may be empty or $null.
# Returns an ArrayList of objects with 'name' and 'desc' properties, one per
# paragraph whose bold text looks like a dish name. The list is returned
# wrapped (",") so an empty or single-item list is not unrolled by the pipeline.
#
# NOTE(review): the tag literals in these patterns (<p>, <strong>, <br>) are
# reconstructed; confirm them against the real page markup.
function Extract-Dishes($courseHtml) {
    $dishes = [System.Collections.ArrayList]@()
    if ([string]::IsNullOrEmpty($courseHtml)) { return ,$dishes }

    # Singleline lets '.' cross line breaks inside a paragraph; IgnoreCase
    # tolerates upper-case tags.
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline -bor
            [System.Text.RegularExpressions.RegexOptions]::IgnoreCase

    # \b keeps '<p' from matching other tags that merely start with "p" (e.g. <pre>).
    $paragraphs = [regex]::Matches($courseHtml, '<p\b[^>]*>(.*?)</p>', $opts)
    foreach ($pm in $paragraphs) {
        $pContent = $pm.Groups[1].Value

        # The dish name is the first bold run; paragraphs without one are skipped.
        $nameM = [regex]::Match($pContent, '<strong\b[^>]*>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }

        $name = Get-CleanText $nameM.Groups[1].Value
        # Reject bold text that is a dietary marker or legend entry rather than a dish.
        if ($name -match '^(GF|GFA|V\+?|DF|V:)$') { continue }
        if ($name.Length -lt 3) { continue }
        if ($name -match '^[A-Z]{1,3}:') { continue }
        if ($name.Length -gt 80) { continue }

        # The description is whatever follows the name: after the first line
        # break when there is one, otherwise directly after the closing tag.
        $afterName = $pContent.Substring($nameM.Index + $nameM.Length)
        $brM = [regex]::Match($afterName, '<br\b[^>]*>', $opts)
        if ($brM.Success) {
            $afterName = $afterName.Substring($brM.Index + $brM.Length)
        }
        $desc = Get-CleanText $afterName

        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }
    return ,$dishes
}
# Return the raw HTML of the text block that follows a course heading.
#   $html        - full page markup; may be empty or $null.
#   $courseLabel - literal heading text to look for, e.g. 'First Course'.
# Returns the markup found after the first 'et_pb_text_inner">' marker that
# occurs within 300 characters of the label, up to the next 'et_pb_text_inner'
# marker or the end of the input. Returns '' when the label or marker is absent.
#
# NOTE(review): the end delimiter is reconstructed; confirm against the real
# page markup that the next text-block marker is the right place to stop.
function Extract-CourseBlock($html, $courseLabel) {
    if ([string]::IsNullOrEmpty($html)) { return '' }

    # Singleline lets '.' span the line breaks between heading and content.
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # The label is escaped so it is matched literally, not as a regex.
    $pattern = [regex]::Escape($courseLabel) +
               '.{0,300}?et_pb_text_inner">(.*?)(?=et_pb_text_inner|\z)'
    $blockM = [regex]::Match($html, $pattern, $opts)
    if ($blockM.Success) { return $blockM.Groups[1].Value }
    return ''
}
# --- Header fields -----------------------------------------------------------

# Restaurant name: the text in front of " | Inlander".
# NOTE(review): assumes this text sits in the page <title>; confirm against the
# real markup. Without an anchor the lazy group would capture from line start.
$nameM = [regex]::Match($html, '<title[^>]*>\s*(.+?) \| Inlander')
Write-Host "Name: $($nameM.Groups[1].Value.Trim())"

# Price: the first dollar amount that appears in the page.
$priceM = [regex]::Match($html, '\$(\d+)')
Write-Host "Price: $($priceM.Groups[1].Value)"

# Cuisine: the greedy capture stops at the first character outside the class
# (such as '<' or '&'), so no explicit terminator is needed. A lazy capture
# followed by an optional terminator would stop after two characters.
$cuisineM = [regex]::Match($html, 'CUISINE:\s*([A-Z][A-Za-z /]*)')
$cuisine = (Get-Culture).TextInfo.ToTitleCase($cuisineM.Groups[1].Value.Trim().ToLower())
Write-Host "Cuisine: $cuisine"

# Phone: first "(208) ddd-dddd" or "(509) ddd-dddd" number in the page.
$phoneM = [regex]::Match($html, '\((?:208|509)\) \d{3}-\d{4}')
Write-Host "Phone: $($phoneM.Value)"

# Hours: text starting at "Menu served" up to the next tag.
$hoursM = [regex]::Match($html, 'Menu served [^<]+')
Write-Host "Hours: $($hoursM.Value.Trim())"

# Areas: every known area label that occurs anywhere in the page, in table
# order. An ordinal case-insensitive search avoids upper-casing the whole page
# once per label and does not depend on the current culture's casing rules.
$areas = @()
foreach ($aKey in $areaMap.Keys) {
    if ($html.IndexOf($aKey, [System.StringComparison]::OrdinalIgnoreCase) -ge 0) {
        $areas += $areaMap[$aKey]
    }
}
Write-Host "Areas: $($areas -join ', ')"
# --- Course listings ---------------------------------------------------------
# For each course: locate its HTML block, extract the dishes, and print one
# " [name] | description" line per dish under a heading.

Write-Host ''
Write-Host '--- FIRST COURSE ---'
$fc = Extract-CourseBlock -html $html -courseLabel 'First Course'
$firstCourse = Extract-Dishes -courseHtml $fc
foreach ($d in $firstCourse) {
    Write-Host (' [{0}] | {1}' -f $d.name, $d.desc)
}

Write-Host ''
Write-Host '--- SECOND COURSE ---'
$sc = Extract-CourseBlock -html $html -courseLabel 'Second Course'
$secondCourse = Extract-Dishes -courseHtml $sc
foreach ($d in $secondCourse) {
    Write-Host (' [{0}] | {1}' -f $d.name, $d.desc)
}

Write-Host ''
Write-Host '--- THIRD COURSE ---'
$tc = Extract-CourseBlock -html $html -courseLabel 'Third Course'
$thirdCourse = Extract-Dishes -courseHtml $tc
foreach ($d in $thirdCourse) {
    Write-Host (' [{0}] | {1}' -f $d.name, $d.desc)
}