# Test parsing on local 315cuisine HTML
#
# Reads a saved restaurant page, pulls out the headline fields (name, price,
# cuisine, phone, hours, areas) and the dishes of each course, and prints them
# with Write-Host so the parsing rules can be checked by eye.
#
# NOTE(review): the copy of this script under review had HTML tags and entity
# names stripped out of several string literals. Patterns marked
# "reconstructed" below were rebuilt from the surrounding code and must be
# confirmed against the real page markup.

$htmlPath = 'C:\Users\DEREKC~1.CHN\AppData\Local\Temp\test-restaurant.html'

# Upper-case marker text searched for in the page -> display name.
# Ordered so the printed area list follows the order written here.
$areaMap = [ordered]@{
    "AIRWAY HEIGHTS" = "Airway Heights"
    "ATHOL"          = "Athol"
    "COEUR D'ALENE"  = "Coeur d'Alene"
    "POST FALLS"     = "Post Falls"
    "HAYDEN"         = "Hayden"
    "LIBERTY LAKE"   = "Liberty Lake"
    "NORTH SPOKANE"  = "North Spokane"
    "SOUTH SPOKANE"  = "South Spokane"
    "SPOKANE VALLEY" = "Spokane Valley"
    "WEST SPOKANE"   = "West Spokane"
    "WORLEY"         = "Worley"
    "DOWNTOWN"       = "Downtown"
}

# Converts an HTML fragment to plain text.
#   $rawHtml - HTML fragment (may be empty).
# Returns the text with tags replaced by spaces, character entities decoded,
# whitespace runs collapsed to a single space, and both ends trimmed.
function Get-CleanText($rawHtml) {
    # Tags become a space so words on either side of a tag do not fuse.
    $text = $rawHtml -replace '<[^>]+>', ' '
    # One decoding pass handles &amp;, &lt;, &gt;, &quot;, &#39;, &nbsp; and
    # numeric entities without double-decoding input such as "&amp;lt;".
    $text = [System.Net.WebUtility]::HtmlDecode($text)
    # .NET \s includes U+00A0, so decoded &nbsp; collapses here as well.
    $text = $text -replace '\s+', ' '
    $text.Trim()
}

# Extracts the dishes listed in one course block.
#   $courseHtml - inner HTML of a course section.
# Returns an ArrayList of objects with 'name' and 'desc' properties. The list
# is returned with the unary comma so it is not unrolled by the pipeline.
function Extract-Dishes($courseHtml) {
    $dishes = [System.Collections.ArrayList]@()
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # reconstructed: each dish is expected to sit in its own <p>...</p>.
    $paragraphs = [regex]::Matches($courseHtml, '<p\b[^>]*>(.*?)</p>', $opts)
    foreach ($pm in $paragraphs) {
        $pContent = $pm.Groups[1].Value

        # reconstructed: the dish name is the first <strong> run; paragraphs
        # without one are not dishes.
        $nameM = [regex]::Match($pContent, '<strong\b[^>]*>(.*?)</strong>', $opts)
        if (-not $nameM.Success) { continue }

        $name = Get-CleanText $nameM.Groups[1].Value
        # Skip dietary-legend markers and implausible names.
        # -match is case-insensitive, so lower-case variants are skipped too.
        if ($name -match '^(GF|GFA|V\+?|DF|V:)$') { continue }
        if ($name.Length -lt 3) { continue }
        if ($name -match '^[A-Z]{1,3}:') { continue }
        if ($name.Length -gt 80) { continue }

        # Description: text after a line break if there is one, otherwise
        # whatever follows the closing </strong>.
        $afterBr = ''
        if ($pContent -match '<br\s*/?>(.*?)$') {          # reconstructed
            $afterBr = $matches[1]
        }
        else {
            $afterStrong = [regex]::Match($pContent, '</strong>(.*?)$', $opts)   # reconstructed
            if ($afterStrong.Success) { $afterBr = $afterStrong.Groups[1].Value }
        }
        $desc = Get-CleanText $afterBr

        $null = $dishes.Add([PSCustomObject]@{ name = $name; desc = $desc })
    }
    return , $dishes
}

# Returns the inner HTML of the text module that follows a course heading.
#   $html        - full page HTML.
#   $courseLabel - heading text, e.g. 'First Course' (matched literally).
# Returns the captured HTML, or '' when the label or its block is not found.
function Extract-CourseBlock($html, $courseLabel) {
    $opts = [System.Text.RegularExpressions.RegexOptions]::Singleline

    # Primary: the text module must open within 300 characters of the label.
    # NOTE(review): the lookahead terminator was lost; '</div>' is assumed.
    $pattern = [regex]::Escape($courseLabel) + '.{0,300}?et_pb_text_inner">(.+?)(?=</div>)'
    $m = [regex]::Match($html, $pattern, $opts)
    if ($m.Success) { return $m.Groups[1].Value }

    # Fallback: first text module anywhere after the label, cut at the next
    # module or closing div.
    # NOTE(review): only the tail of this pattern survived; the prefix, the
    # second lookahead branch and the index guard are reconstructed.
    $idx = $html.IndexOf($courseLabel)
    if ($idx -ge 0) {
        $innerM = [regex]::Match($html.Substring($idx), 'et_pb_text_inner">(.*?)(?=et_pb_text_inner|</div>)', $opts)
        if ($innerM.Success) { return $innerM.Groups[1].Value }
    }
    return ''
}

# Prints one course header followed by its dishes, one per line.
function Write-DishList($title, $dishes) {
    Write-Host ""
    Write-Host "--- $title ---"
    foreach ($d in $dishes) {
        Write-Host " [$($d.name)] | $($d.desc)"
    }
}

if (-not (Test-Path -LiteralPath $htmlPath)) {
    # Without the input file every match below would run against nothing and
    # print an empty report, so say what is wrong instead.
    Write-Warning "Test HTML not found: $htmlPath"
}
else {
    $html = [System.IO.File]::ReadAllText($htmlPath, [System.Text.Encoding]::UTF8)

    # NOTE(review): the tag in front of the capture was lost; <title> assumed.
    $nameM = [regex]::Match($html, '<title>\s*(.+?)\s*\| Inlander')
    Write-Host "Name: $($nameM.Groups[1].Value.Trim())"

    $priceM = [regex]::Match($html, '<strong>\$(\d+)</strong>')
    Write-Host "Price: $($priceM.Groups[1].Value)"

    $cuisineM = [regex]::Match($html, 'CUISINE:\s*([A-Z][A-Za-z /]+?)(?:\s*</|\s*<)')
    # ToTitleCase leaves all-caps words alone, hence the ToLower first.
    $cuisine = (Get-Culture).TextInfo.ToTitleCase($cuisineM.Groups[1].Value.Trim().ToLower())
    Write-Host "Cuisine: $cuisine"

    $phoneM = [regex]::Match($html, '\((?:208|509)\) \d{3}-\d{4}')
    Write-Host "Phone: $($phoneM.Value)"

    $hoursM = [regex]::Match($html, 'Menu served [^<]+')
    Write-Host "Hours: $($hoursM.Value.Trim())"

    # Upper-case the page once rather than once per area key.
    $upperHtml = $html.ToUpper()
    $areas = @(
        foreach ($aKey in $areaMap.Keys) {
            if ($upperHtml.Contains($aKey)) { $areaMap[$aKey] }
        }
    )
    Write-Host "Areas: $($areas -join ', ')"

    $fc = Extract-CourseBlock $html 'First Course'
    $firstCourse = Extract-Dishes $fc
    Write-DishList 'FIRST COURSE' $firstCourse

    $sc = Extract-CourseBlock $html 'Second Course'
    $secondCourse = Extract-Dishes $sc
    Write-DishList 'SECOND COURSE' $secondCourse

    $tc = Extract-CourseBlock $html 'Third Course'
    $thirdCourse = Extract-Dishes $tc
    Write-DishList 'THIRD COURSE' $thirdCourse
}