Files
Inlander-Restaurant-Week-Pi…/test-parse.ps1

120 lines
4.2 KiB
PowerShell

# Load the locally saved 315cuisine restaurant page that the parsing checks
# below run against. The file is decoded as UTF-8.
# NOTE(review): the path points into one specific user's Temp folder (8.3 short
# name), so it presumably only resolves on the original author's machine.
$htmlPath = 'C:\Users\DEREKC~1.CHN\AppData\Local\Temp\test-restaurant.html'
$html = [IO.File]::ReadAllText($htmlPath, [Text.Encoding]::UTF8)
# Lookup from the upper-case area label searched for in the page to the
# display name reported for it. The dictionary is ordered, so iteration
# follows the insertion order below.
$areaMap = [ordered]@{}
$areaMap.Add("AIRWAY HEIGHTS", "Airway Heights")
$areaMap.Add("ATHOL", "Athol")
$areaMap.Add("COEUR D'ALENE", "Coeur d'Alene")
$areaMap.Add("POST FALLS", "Post Falls")
$areaMap.Add("HAYDEN", "Hayden")
$areaMap.Add("LIBERTY LAKE", "Liberty Lake")
$areaMap.Add("NORTH SPOKANE", "North Spokane")
$areaMap.Add("SOUTH SPOKANE", "South Spokane")
$areaMap.Add("SPOKANE VALLEY", "Spokane Valley")
$areaMap.Add("WEST SPOKANE", "West Spokane")
$areaMap.Add("WORLEY", "Worley")
$areaMap.Add("DOWNTOWN", "Downtown")
# Converts an HTML fragment into single-line plain text.
#
# Parameters:
#   $rawHtml - HTML markup (string) to flatten.
# Returns:
#   The text with tags removed, HTML entities decoded, every run of whitespace
#   collapsed to one space, and leading/trailing whitespace trimmed.
function Get-CleanText($rawHtml) {
    # Remove tags BEFORE decoding so that entity-encoded angle brackets
    # (e.g. "&lt;b&gt;") survive as literal text instead of being stripped.
    # Each tag becomes a space so adjacent words do not run together.
    $text = $rawHtml -replace '<[^>]+>', ' '

    # Decode all entities in one pass. A chain of -replace calls that handles
    # '&amp;' first would decode "&amp;lt;" twice (to "<"), and would miss
    # entities such as &#39;, &eacute; or &rsquo;.
    $text = [System.Net.WebUtility]::HtmlDecode($text)

    # \s also matches the no-break space produced by &nbsp;, so it collapses
    # together with ordinary whitespace.
    $text = $text -replace '\s+', ' '
    $text.Trim()
}
# Extracts the dishes listed in one course's HTML.
#
# A dish is a <p> element whose first <strong>...</strong> holds the dish name.
# The description is whatever follows the first <br> in that paragraph or, when
# the paragraph has no <br>, whatever follows the first </strong>.
#
# Parameters:
#   $courseHtml - HTML fragment for a single course.
# Returns:
#   An ArrayList of objects with 'name' and 'desc' string properties (both
#   passed through Get-CleanText). The list is returned as a single object
#   (not unrolled), so an empty result is an empty list.
function Extract-Dishes($courseHtml) {
    $dishes = [System.Collections.ArrayList]@()
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline
    # The <br> lookup must span newlines like the other patterns here, and it
    # stays case-insensitive so <BR> is still recognised.
    $brOpts = [System.Text.RegularExpressions.RegexOptions]'Singleline, IgnoreCase'

    foreach ($pm in [regex]::Matches($courseHtml, '<p[^>]*>(.*?)</p>', $opts)) {
        $pContent = $pm.Groups[1].Value

        # Paragraphs without a <strong>...</strong> pair carry no dish name.
        $nameM = [regex]::Match($pContent, '<strong>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }
        $name = Get-CleanText $nameM.Groups[1].Value

        # Skip bold text that is a bare marker (GF, GFA, V, V+, DF, V:) rather
        # than a dish name.
        if ($name -match '^(GF|GFA|V\+?|DF|V:)$') { continue }
        if ($name.Length -lt 3) { continue }
        # Skip "XX:"-style prefixes of one to three letters.
        # NOTE(review): -match is case-insensitive, so this also skips names
        # such as "Tea: ..." - confirm that is intended.
        if ($name -match '^[A-Z]{1,3}:') { continue }
        # Very long bold runs are not treated as a dish name.
        if ($name.Length -gt 80) { continue }

        # Take everything after the FIRST <br>. Matching with Singleline keeps
        # a newline inside the description from making the match fail at the
        # first <br> and latch onto a later one, which dropped the leading
        # part of the description.
        $afterBr = ''
        $brM = [regex]::Match($pContent, '<br\s*/?>(.*)$', $brOpts)
        if ($brM.Success) {
            $afterBr = $brM.Groups[1].Value
        } else {
            $afterStrong = [regex]::Match($pContent, '</strong>(.*?)$', $opts)
            if ($afterStrong.Success) { $afterBr = $afterStrong.Groups[1].Value }
        }

        $desc = Get-CleanText $afterBr
        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }

    # The unary comma stops PowerShell from unrolling the list on return.
    return ,$dishes
}
# Returns the HTML found inside the first 'et_pb_text_inner' container that
# follows a course heading.
#
# Parameters:
#   $html        - full page markup to search.
#   $courseLabel - heading text to locate (matched literally).
# Returns:
#   The captured inner HTML, or an empty string when nothing matches.
function Extract-CourseBlock($html, $courseLabel) {
    $singleLine = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # First attempt: the label, then within 300 characters the container, with
    # its content captured up to the next heading / container-with-heading /
    # "V:" marker / "Vegetarian item" text / inner row 2 or 3.
    $primaryPattern = [regex]::Escape($courseLabel) + '.{0,300}?et_pb_text_inner">(.+?)(?=<h[123]|et_pb_text_inner"><h|V:\s*<|Vegetarian item|et_pb_row_inner_[23])'
    $primary = [regex]::Match($html, $primaryPattern, $singleLine)
    if ($primary.Success) { return $primary.Groups[1].Value }

    # Fallback: look only at a window of at most 4000 characters starting at
    # the label and capture the first container up to the next container or a
    # run of three closing </div> tags.
    $start = $html.IndexOf($courseLabel)
    if ($start -lt 0) { return '' }

    $windowLength = [Math]::Min(4000, $html.Length - $start)
    $window = $html.Substring($start, $windowLength)
    $fallback = [regex]::Match($window, 'et_pb_text_inner">(.*?)(?=et_pb_text_inner|</div></div></div>)', $singleLine)
    if ($fallback.Success) { return $fallback.Groups[1].Value }

    return ''
}
# --- Restaurant metadata ---
# Each field is located with its own regex over the whole page. When a pattern
# does not match, the corresponding group/value is an empty string, so the
# line is still printed with a blank value.

# Restaurant name: the <title> text before " | Inlander".
$nameM = [regex]::Match($html, '<title>(.+?) \| Inlander')
Write-Host "Name: $($nameM.Groups[1].Value.Trim())"

# Price: a bold whole-dollar amount such as <strong>$35</strong>.
$priceM = [regex]::Match($html, '<strong>\$(\d+)</strong>')
Write-Host "Price: $($priceM.Groups[1].Value)"

# Cuisine: text after "CUISINE:" up to the next tag, re-cased to Title Case.
# ToTitleCase leaves all-caps words alone, hence the ToLower() first.
$cuisineM = [regex]::Match($html, 'CUISINE:\s*([A-Z][A-Za-z /]+?)(?:\s*</|\s*<)')
$cuisine = (Get-Culture).TextInfo.ToTitleCase($cuisineM.Groups[1].Value.Trim().ToLower())
Write-Host "Cuisine: $cuisine"

# Phone: first "(208) ddd-dddd" or "(509) ddd-dddd" number on the page.
$phoneM = [regex]::Match($html, '\((?:208|509)\) \d{3}-\d{4}')
Write-Host "Phone: $($phoneM.Value)"

# Hours: the "Menu served ..." sentence up to the next tag.
$hoursM = [regex]::Match($html, 'Menu served [^<]+')
Write-Host "Hours: $($hoursM.Value.Trim())"

# Areas: every known area label that appears anywhere in the page, in
# $areaMap order. An ordinal, case-insensitive IndexOf avoids building an
# upper-cased copy of the whole page for every key and does not depend on the
# current culture's casing rules.
$areas = @()
foreach ($aKey in $areaMap.Keys) {
    if ($html.IndexOf($aKey, [System.StringComparison]::OrdinalIgnoreCase) -ge 0) {
        $areas += $areaMap[$aKey]
    }
}
Write-Host "Areas: $($areas -join ', ')"
# --- Course listings ---
# For each of the three courses: print a header, pull that course's HTML block
# out of the page, extract its dishes, and print one "[name] | description"
# line per dish.

Write-Host ''
Write-Host '--- FIRST COURSE ---'
$fc = Extract-CourseBlock $html 'First Course'
$firstCourse = Extract-Dishes $fc
foreach ($d in $firstCourse) {
    Write-Host (' [{0}] | {1}' -f $d.name, $d.desc)
}

Write-Host ''
Write-Host '--- SECOND COURSE ---'
$sc = Extract-CourseBlock $html 'Second Course'
$secondCourse = Extract-Dishes $sc
foreach ($d in $secondCourse) {
    Write-Host (' [{0}] | {1}' -f $d.name, $d.desc)
}

Write-Host ''
Write-Host '--- THIRD COURSE ---'
$tc = Extract-CourseBlock $html 'Third Course'
$thirdCourse = Extract-Dishes $tc
foreach ($d in $thirdCourse) {
    Write-Host (' [{0}] | {1}' -f $d.name, $d.desc)
}