Haskell: Comparing scalpel with parsec-tagsoup

Scalpel and parsec-tagsoup are two options to extract contents from HTML files. Scalpel provides are pretty high level interface via so called selectors, while tagsoup parsec utilizes the parsec parser combinator functions in combination with tagsoup.

Both of these packages allow one to extract data in between the thousands of tags not of interest. As the name suggests, scalpel is very good in cutting one very specific portions of interest, while parsec-tagsoup gives you the absolute control how to handle every single tag, at the cost of more boilerplate.

This short post will not go into detail how to use these two packages. Instead we look at two constructed scenarios and how they can be solved with both of the aforementioned packages.

Input

The inputs are very basic stripped down plain HTML files.

Input example 1

Input example 2

Scalpel

Example 1

#!/usr/bin/env stack
{- stack
  script
  --resolver lts-9.12
  --package scalpel
-}

{-# LANGUAGE OverloadedStrings #-}

import Text.HTML.Scalpel
import Control.Monad
import Control.Applicative


main :: IO ()
main = do
  exampleHtml <- readFile "example.html"
  let scrapeResults = scrapeStringLike exampleHtml altTextAndImages
  printScrapeResults scrapeResults

printScrapeResults :: Maybe [(String,String,Maybe String)] -> IO ()
printScrapeResults Nothing = putStrLn "Something went wrong!"
printScrapeResults (Just []) = putStrLn "Couldn't scrape anything!"
printScrapeResults (Just results) = forM_ results print

altTextAndImages :: Scraper String [(String,String,Maybe String)]
altTextAndImages =
    chroot ("table" @: [hasClass "interesting"])
           (chroots "tr"
                    (do
                       firstCol <- text ("td" @: ["class" @= "first-column"])
                       secondCol <- text ("td" @: [notP ("class" @= "first-column")])
                       link <- optional $ attr "href" ("td" // "a")
                       return (strip firstCol, strip secondCol, link)))
    where
      strip = unwords . words

Example 2

#!/usr/bin/env stack
{- stack
  script
  --resolver lts-9.12
  --package scalpel
-}

{-# LANGUAGE OverloadedStrings #-}

import Text.HTML.Scalpel
import Control.Monad
import Control.Applicative


main :: IO ()
main = do
  exampleHtml <- readFile "example2.html"
  let scrapeResults = scrapeStringLike exampleHtml altTextAndImages
  printScrapeResults scrapeResults

printScrapeResults Nothing = putStrLn "Something went wrong!"
printScrapeResults (Just []) = putStrLn "Couldn't scrape anything!"
printScrapeResults (Just results) = forM_ results print

altTextAndImages :: Scraper String [String]
altTextAndImages =
    chroots ("div" @: [hasClass "statistics"]) (text "h2")

Parsec Tagsoup

Example 1

#!/usr/bin/env stack
{- stack --resolver lts-9.12 runghc
  --package tagsoup
  --package parsec
  --package parsec-tagsoup
-}


import Control.Monad (forM_)
import Text.Parsec hiding (satisfy)
import Text.Parsec.String
import Text.HTML.TagSoup
import Text.ParserCombinators.Parsec.Tag


notTag :: TagRep rep => rep -> TagParser String () (Tag String)
notTag t = satisfy (~/= t) <?> ("not (" ++ show(toTagRep t :: Tag String) ++ ")")


strip = unwords . words


main :: IO ()
main = do
  exampleHtml <- readFile "example.html"
  let tags = parseTags exampleHtml
      parseResults = parse getTableRows "demo" tags
  printParseResults parseResults


printParseResults :: Either ParseError [(String,String,Maybe String)] -> IO ()
printParseResults (Left err) = print err
printParseResults (Right results) = forM_ results print


getTableRows = do
  skipMany (notTag (TagOpen "table" [("class","interesting things")]))
  tagP (TagOpen "table" []) (\_ ->
    many (tagP (TagOpen "tr" []) (\_ ->
      do
        tagOpen "td"
        (txt1,_) <- parseCol
        tagClose "td"
        tagOpen "td"
        (txt2,lnk) <- parseCol
        tagClose "td"
        return (strip txt1,strip txt2,lnk))))


parseCol = do
  optional (tagOpen "b")
  lnk <- optionMaybe(do
            lnkOpen <- tagOpen "a"
            return $ fromAttrib "href" lnkOpen)
  txt <- tagText
  optional (tagClose "a")
  optional (tagClose "b")
  return (txt,lnk)

Example 2

#!/usr/bin/env stack
{- stack --resolver lts-9.12 runghc
  --package tagsoup
  --package parsec
  --package parsec-tagsoup
-}


import Control.Monad (forM_)
import Text.Parsec hiding (satisfy)
import Text.Parsec.String
import Text.HTML.TagSoup
import Text.ParserCombinators.Parsec.Tag


notTag :: TagRep rep => rep -> TagParser String () (Tag String)
notTag t = satisfy (~/= t) <?> ("not (" ++ show(toTagRep t :: Tag String) ++ ")")


strip = unwords . words


main :: IO ()
main = do
  exampleHtml <- readFile "example2.html"
  let tags = parseTags exampleHtml
      parseResults = parse getTableHeaders "demo" tags
  printParseResults parseResults


printParseResults :: Either ParseError [String] -> IO ()
printParseResults (Left err) = print err
printParseResults (Right results) = forM_ results print


getTableHeaders = do
  skipMany (notTag (TagOpen "div" [("class","statistics")]))
  many getTableHeader


getTableHeader = do
  header <- tagP (TagOpen "div" [("class","statistics")])
                 (\_ -> do
                        tagOpen "h2"
                        header <- tagText
                        tagClose "h2"
                        return header)
  skipMany (notTag (TagOpen "div" [("class","statistics")]))
  return header

Conclusion

Scalpel really shines when we want to extract data that shares common attributes, possibly to be found scattered all over the HTML source. The really nice part is that it alleviates the programmer from having to write all the open/close tag boilerplate.

Parsec-tagsoup, backed up by the powerful parsec library, has its strong sides when dealing with very detailed localized data, or when conditional parsing is required.