Skip to content

Commit

Permalink
Add 'http-client/publicsuffixlist/' from commit '45b882fcd53347f02f4b…
Browse files Browse the repository at this point in the history
…2f7f1107f50a41a1cfd4'

git-subtree-dir: http-client/publicsuffixlist
git-subtree-mainline: c71db8c
git-subtree-split: 45b882f
  • Loading branch information
snoyberg committed Apr 29, 2015
2 parents c71db8c + 45b882f commit a6f94e3
Show file tree
Hide file tree
Showing 12 changed files with 10,745 additions and 0 deletions.
75 changes: 75 additions & 0 deletions http-client/publicsuffixlist/Create.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
{-|
This script downloads the public suffix list from mozilla's website, and uses
Network.PublicSuffixList.Create.sink to construct an opaque data structure which can
be used with the isSuffix function in Network.PublicSuffixList.Lookup. It then
generates a source file with the contents of this data structure so that
applications can link against this source file and get knowledget of public suffixes
without doing anything at runtime.
-}

import qualified Data.ByteString as BS
import qualified Data.ByteString.UTF8 as U8
import qualified Data.Conduit as C
import qualified Data.Text as T
import Data.Time.Clock
import qualified Network.HTTP.Conduit as HC
import Data.Conduit.Binary (conduitFile)
import System.IO

import Network.PublicSuffixList.Create
import Network.PublicSuffixList.Types
import Network.PublicSuffixList.Serialize


generateDataStructure :: String -> IO (DataStructure, UTCTime)
generateDataStructure url = do
req <- HC.parseUrl url
out <- HC.withManager $ \ manager -> do
res <- HC.http req manager
HC.responseBody res C.$$+- conduitFile "effective_tld_names.dat" C.=$ sink
current_time <- getCurrentTime
putStrLn $ "Fetched Public Suffix List at " ++ show current_time
return (out, current_time)

main :: IO ()
main = do
(ds, current_time) <- generateDataStructure "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1"
withFile "Network/PublicSuffixList/DataStructure.hs" WriteMode $ \ h -> do
hPutStrLn h "{-# LANGUAGE OverloadedStrings #-}"
hPutStrLn h "{-# LANGUAGE CPP #-}"
hPutStrLn h ""
hPutStrLn h $ "-- DO NOT MODIFY! This file has been automatically generated from the Create.hs script at " ++ show current_time
hPutStrLn h ""
hPutStrLn h "module Network.PublicSuffixList.DataStructure (dataStructure) where"
hPutStrLn h ""
hPutStrLn h "import Data.ByteString.Char8 ()"
hPutStrLn h ""
hPutStrLn h "import Network.PublicSuffixList.Types"
hPutStrLn h "#if !defined(RUNTIMELIST)"
hPutStrLn h "import qualified Data.ByteString as BS"
hPutStrLn h "import Network.PublicSuffixList.Serialize"
hPutStrLn h "#else"
hPutStrLn h "import qualified Network.PublicSuffixList.Create as PSLC"
hPutStrLn h "import qualified Data.Conduit as C"
hPutStrLn h "import Data.Conduit.Binary (sourceFile)"
hPutStrLn h "import System.IO.Unsafe (unsafePerformIO)"
hPutStrLn h "#endif"
hPutStrLn h ""
hPutStrLn h "-- We could just put the raw data structure here, but if we do that, there will be lots of"
hPutStrLn h "-- static string literals, which makes GHC really slow when compiling. Instead, we can manually"
hPutStrLn h "-- serialize the datastructure ourself, so there's only one string literal."
hPutStrLn h ""
hPutStrLn h "{-|"
hPutStrLn h $ "The opaque data structure that 'isSuffix' can query. This data structure was generated at " ++ show current_time
hPutStrLn h "-}"
hPutStrLn h "dataStructure :: DataStructure"
hPutStrLn h "#if defined(RUNTIMELIST)"
hPutStrLn h "{-# NOINLINE dataStructure #-}"
hPutStrLn h "dataStructure = unsafePerformIO $ C.runResourceT $ sourceFile RUNTIMELIST C.$$ PSLC.sink"
hPutStrLn h "#else"
hPutStrLn h "dataStructure = getDataStructure serializedDataStructure"
hPutStrLn h ""
hPutStrLn h "serializedDataStructure :: BS.ByteString"
hPutStrLn h $ "serializedDataStructure = " ++ (show $ putDataStructure ds)
hPutStrLn h ""
hPutStrLn h "#endif"
129 changes: 129 additions & 0 deletions http-client/publicsuffixlist/CreateTest.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import Control.Monad.IO.Class (liftIO)
import Data.Array ((!))
import qualified Data.ByteString as BS
import qualified Data.Conduit as C
import Data.Conduit.Binary (sinkHandle)
import qualified Data.Conduit.List as CL
import qualified Data.Conduit.Text as CT
import Data.Text (pack, unpack)
import Data.Time.Clock
import qualified Network.HTTP.Conduit as HC
import System.IO
import Text.Regex.Base.RegexLike
import Text.Regex.Posix.String
import Data.List (stripPrefix)
import Data.Maybe (fromMaybe)
import Control.Monad (guard)

header :: [String]
header = [ "{-# LANGUAGE OverloadedStrings #-}"
, ""
, "import Data.Char"
, "import Data.Maybe"
, "import qualified Data.Text as T"
, "import Debug.Trace"
, "import Network.PublicSuffixList.DataStructure"
, "import qualified Network.PublicSuffixList.Lookup as L"
, "import Network.PublicSuffixList.Serialize"
, "import System.Exit"
, "import Test.HUnit"
, "import Text.IDNA"
, ""
, ""
, "effectiveTLDPlusOne' :: T.Text -> Maybe T.Text"
, "effectiveTLDPlusOne' = L.effectiveTLDPlusOne . T.intercalate \".\" . map (fromJust . toASCII False True . T.map toLower) . T.split (== '.')"
]

header2 :: [String]
header2 = [ "hunittests :: Test"
, "hunittests = TestList ["
]

footer :: [String]
footer = [ " ]"
, ""
, "testSerializationRoundTrip = TestCase $ assertEqual \"Round Trip\" dataStructure ds"
, " where ds = getDataStructure serializedDataStructure"
, " serializedDataStructure = putDataStructure dataStructure"
, ""
, "main = do"
, " counts <- runTestTT $ TestList [TestLabel \"Mozilla Tests\" hunittests, TestLabel \"Round Trip\" testSerializationRoundTrip]"
, " if errors counts == 0 && failures counts == 0"
, " then exitSuccess"
, " else exitFailure"
]

whitespace :: String -> Bool
whitespace = matchTest regex
where regex = makeRegex "^[[:blank:]]*$" :: Regex

comment :: String -> Bool
comment = matchTest regex
where regex = makeRegex "^[[:blank:]]*//" :: Regex

nullinput :: String -> Bool
nullinput = (==) "checkPublicSuffix(null, null);"

startswithdot :: String -> Bool
startswithdot = matchTest regex
where regex = makeRegex "^checkPublicSuffix\\('\\.(.+)', (.+)\\);$" :: Regex

input :: String -> (String, Maybe String)
input s = fromMaybe (error $ "input failed on " ++ show s) $ do
s <- stripPrefix "checkPublicSuffix('" s
(l, '\'':s) <- Just $ break (== '\'') s
s <- stripPrefix ", " s
(r, s) <- Just $ break (== ')') s
guard $ s == ");"
let m = do
r <- stripPrefix "'" r
r <- stripPrefix "'" $ reverse r
Just $ reverse r
Just (l, m)

counter :: (Monad m, Num t1) => C.Conduit t m (t, t1)
counter = counterHelper 0
where counterHelper count = C.await >>= \ x -> case x of
Nothing -> return ()
Just a -> C.yield (a, count) >> counterHelper (count + 1)

intersperse :: (Monad m) => a -> C.Conduit a m a
intersperse i = C.await >>= \ x -> case x of
Nothing -> return ()
Just a -> C.yield a >> intersperseHelper
where intersperseHelper = C.await >>= \ x -> case x of
Nothing -> return ()
Just a -> C.yield i >> C.yield a >> intersperseHelper

output :: (Show t1, Show t2) => ((String, t1), t2) -> String
output ((s, b), c) = " TestCase $ assertEqual \"" ++ (show c) ++ "\" (" ++ (show b) ++ ") $ effectiveTLDPlusOne' \"" ++ s ++ "\""

populateFile :: String -> String -> IO ()
populateFile url filename = withFile filename WriteMode $ \ h -> do
current_time <- getCurrentTime
putStrLn $ "Fetched Public Suffix List at " ++ show current_time
mapM_ (hPutStrLn h) header
hPutStrLn h $ "-- DO NOT MODIFY! This file has been automatically generated from the CreateTest.hs script at " ++ show current_time
mapM_ (hPutStrLn h) header2
req <- HC.parseUrl url
HC.withManager $ \ manager -> do
res <- HC.http req manager
HC.responseBody res C.$$+-
CT.decode CT.utf8 C.=$
CT.lines C.=$
CL.map unpack C.=$
CL.filter (not . whitespace) C.=$
CL.filter (not . comment) C.=$
CL.filter (not . nullinput) C.=$
CL.filter (not . startswithdot) C.=$
CL.map input C.=$
counter C.=$
CL.map output C.=$
intersperse ",\n" C.=$
CL.map pack C.=$
CT.encode CT.utf8 C.=$
sinkHandle h
mapM_ (hPutStrLn h) footer

main :: IO ()
main = populateFile "http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1" "Test/PublicSuffixList.hs"
25 changes: 25 additions & 0 deletions http-client/publicsuffixlist/LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
The following license covers this documentation, and the source code, except
where otherwise indicated.

Copyright 2012, Myles C. Maxfield. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
72 changes: 72 additions & 0 deletions http-client/publicsuffixlist/Network/PublicSuffixList/Create.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoMonomorphismRestriction #-}
{-# LANGUAGE CPP #-}

{-|
This script parses the public suffix list, and constructs a data structure which can
be used with the isSuffix function in Lookup.hs. It exports a GSink which produces
the opaque 'DataStructure' and can be fed any Source as input.
This makes an few assumption about the information in the public suffix list:
namely, that no rule is a suffix of another rule. For example, if there is a rule
abc.def.ghi
then then is no other rule
def.ghi
or
!def.ghi
The actual data structure involved here is a tree where the nodes have no value and
the edges are DNS labels. There are two trees: one to handle the exception rules,
and one to handle the regular rules.
-}

module Network.PublicSuffixList.Create (PublicSuffixListException, sink) where

import Control.Exception
import qualified Data.ByteString as BS
import qualified Data.Conduit as C
import qualified Data.Conduit.List as CL
import qualified Data.Conduit.Text as CT
import qualified Data.Map as M
import qualified Data.Text as T
import Data.Typeable
import Text.IDNA
import Control.Monad.Catch (MonadThrow)

import Network.PublicSuffixList.Types

data PublicSuffixListException = PublicSuffixListException
deriving (Show, Typeable)

instance Exception PublicSuffixListException

insert :: (Ord e) => Tree e -> [e] -> Tree e
insert _ [] = def
insert t (p : ps) = case M.lookup p $ children t of
Nothing -> t { children = M.insert p (insert def ps) $ children t }
Just l -> t { children = M.insert p (insert l ps) $ children t }

foldingFunction :: DataStructure -> T.Text -> DataStructure
foldingFunction d@(rules, exceptions) s'
| T.null s = d
| T.take 2 s == "//" = d
| T.head s == '!' = (rules, insert exceptions $ labelList $ T.tail s)
| otherwise = (insert rules $ labelList s, exceptions)
where ss = filter (not . T.null) $ T.words s'
s
| null ss = ""
| otherwise = head ss
labelList = reverse . map internationalize . T.split (== '.')
internationalize str
| str == "*" = str
| otherwise = case toASCII False True $ T.toLower str of
Just x -> x
Nothing -> throw PublicSuffixListException

{-
Generate the opaque 'DataStructure'
-}
sink :: MonadThrow m => C.Sink BS.ByteString m DataStructure
sink = CT.decode CT.utf8 C.=$ CT.lines C.=$ CL.fold foldingFunction (def, def)

Large diffs are not rendered by default.

88 changes: 88 additions & 0 deletions http-client/publicsuffixlist/Network/PublicSuffixList/Lookup.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{-# LANGUAGE OverloadedStrings #-}
module Network.PublicSuffixList.Lookup (effectiveTLDPlusOne, effectiveTLDPlusOne', isSuffix, isSuffix') where

import qualified Data.Map as M
import Data.Maybe (isNothing)
import qualified Data.Text as T

import qualified Network.PublicSuffixList.DataStructure as DS
import Network.PublicSuffixList.Types

{-|
OffEnd's Bool argument represents whether we fell off a
leaf or whether we fell off a non-leaf. True means that
we fell off a leaf. Its Text argument is the component
that pushed us off the end, along with all the components
to the right of that one, interspersed with "."s
-}
data LookupResult = Inside | AtLeaf | OffEnd Bool T.Text
deriving (Eq)

{-|
This function returns whether or not this domain is owned by a
registrar or a regular person. 'Nothing' means that this is a registrar
domain; 'Just x' means it's owned by a person. This is used to determine
if a cookie is allowed to bet set for a particular domain. For
example, you shouldn't be able to set a cookie for \"com\".
If the value is 'Just x', then the x value is what is known as the
effective TLD plus one. This is one segment more than the suffix of the
domain. For example, the eTLD+1 for "this.is.a.subdom.com" is Just
"subdom.com"
Note that this function expects lowercase ASCII strings. These strings
should be gotten from the toASCII algorithm as described in RFC 3490.
These strings should not start or end with the \'.\' character, and should
not have two \'.\' characters next to each other.
(The toASCII algorithm is implemented in the \'idna\' hackage package,
though that package doesn't always map strings to lowercase)
-}
effectiveTLDPlusOne' :: DataStructure -> T.Text -> Maybe T.Text
effectiveTLDPlusOne' dataStructure s
-- Any TLD is a suffix
| length ss == 1 = Nothing
| otherwise = output rulesResult exceptionResult
where ss = T.splitOn "." s
ps = reverse ss
exceptionResult = recurse ps [] $ snd dataStructure
rulesResult = recurse ps [] $ fst dataStructure
-- If we fell off, did we do it at a leaf? Otherwise, what's the
-- subtree that we're at
getNext :: Tree T.Text -> T.Text -> Either Bool (Tree T.Text)
getNext t s' = case M.lookup s' $ children t of
Nothing -> Left (M.null $ children t)
Just t' -> Right t'
-- Look up the component we're looking for...
getNextWithStar t s' = case getNext t s' of
-- and if that fails, look up "*"
Left _ -> getNext t "*"
r -> r
recurse :: [T.Text] -> [T.Text] -> Tree T.Text -> LookupResult
recurse [] _ t
| M.null $ children t = AtLeaf
| otherwise = Inside
recurse (c : cs) prev t = case getNextWithStar t c of
Left b -> OffEnd b $ T.intercalate "." (c : prev)
Right t' -> recurse cs (c : prev) t'
-- Only match against the exception rules if we have a full match
output _ AtLeaf = Just s
output _ (OffEnd True x) = Just $ T.intercalate "." $ tail $ T.splitOn "." x
-- If we have a subdomain on an existing rule, we're not a suffix
output (OffEnd _ x) _
-- A single level domain can never be a eTLD+1
| isNothing $ T.find (== '.') x = Just $ T.intercalate "." $ drop (length ss - 2) ss
| otherwise = Just x
-- Otherwise, we're a suffix of a suffix, which is a suffix
output _ _ = Nothing

-- | >>> effectiveTLDPlusOne = effectiveTLDPlusOne' Network.PublicSuffixList.DataStructure.dataStructure
effectiveTLDPlusOne :: T.Text -> Maybe T.Text
effectiveTLDPlusOne = effectiveTLDPlusOne' DS.dataStructure

-- | >>> isSuffix' dataStructure = isNothing . effectiveTLDPlusOne' dataStructure
isSuffix' :: DataStructure -> T.Text -> Bool
isSuffix' dataStructure = isNothing . effectiveTLDPlusOne' dataStructure

-- | >>> isSuffix = isSuffix' Network.PublicSuffixList.DataStructure.dataStructure
isSuffix :: T.Text -> Bool
isSuffix = isNothing . effectiveTLDPlusOne
Loading

0 comments on commit a6f94e3

Please sign in to comment.