forked from snoyberg/http-client
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add 'http-client/publicsuffixlist/' from commit '45b882fcd53347f02f4b…
…2f7f1107f50a41a1cfd4' git-subtree-dir: http-client/publicsuffixlist git-subtree-mainline: c71db8c git-subtree-split: 45b882f
- Loading branch information
Showing
12 changed files
with
10,745 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
{-| | ||
This script downloads the public suffix list from mozilla's website, and uses | ||
Network.PublicSuffixList.Create.sink to construct an opaque data structure which can | ||
be used with the isSuffix function in Network.PublicSuffixList.Lookup. It then | ||
generates a source file with the contents of this data structure so that | ||
applications can link against this source file and get knowledget of public suffixes | ||
without doing anything at runtime. | ||
-} | ||
|
||
import qualified Data.ByteString as BS | ||
import qualified Data.ByteString.UTF8 as U8 | ||
import qualified Data.Conduit as C | ||
import qualified Data.Text as T | ||
import Data.Time.Clock | ||
import qualified Network.HTTP.Conduit as HC | ||
import Data.Conduit.Binary (conduitFile) | ||
import System.IO | ||
|
||
import Network.PublicSuffixList.Create | ||
import Network.PublicSuffixList.Types | ||
import Network.PublicSuffixList.Serialize | ||
|
||
|
||
generateDataStructure :: String -> IO (DataStructure, UTCTime) | ||
generateDataStructure url = do | ||
req <- HC.parseUrl url | ||
out <- HC.withManager $ \ manager -> do | ||
res <- HC.http req manager | ||
HC.responseBody res C.$$+- conduitFile "effective_tld_names.dat" C.=$ sink | ||
current_time <- getCurrentTime | ||
putStrLn $ "Fetched Public Suffix List at " ++ show current_time | ||
return (out, current_time) | ||
|
||
main :: IO () | ||
main = do | ||
(ds, current_time) <- generateDataStructure "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1" | ||
withFile "Network/PublicSuffixList/DataStructure.hs" WriteMode $ \ h -> do | ||
hPutStrLn h "{-# LANGUAGE OverloadedStrings #-}" | ||
hPutStrLn h "{-# LANGUAGE CPP #-}" | ||
hPutStrLn h "" | ||
hPutStrLn h $ "-- DO NOT MODIFY! This file has been automatically generated from the Create.hs script at " ++ show current_time | ||
hPutStrLn h "" | ||
hPutStrLn h "module Network.PublicSuffixList.DataStructure (dataStructure) where" | ||
hPutStrLn h "" | ||
hPutStrLn h "import Data.ByteString.Char8 ()" | ||
hPutStrLn h "" | ||
hPutStrLn h "import Network.PublicSuffixList.Types" | ||
hPutStrLn h "#if !defined(RUNTIMELIST)" | ||
hPutStrLn h "import qualified Data.ByteString as BS" | ||
hPutStrLn h "import Network.PublicSuffixList.Serialize" | ||
hPutStrLn h "#else" | ||
hPutStrLn h "import qualified Network.PublicSuffixList.Create as PSLC" | ||
hPutStrLn h "import qualified Data.Conduit as C" | ||
hPutStrLn h "import Data.Conduit.Binary (sourceFile)" | ||
hPutStrLn h "import System.IO.Unsafe (unsafePerformIO)" | ||
hPutStrLn h "#endif" | ||
hPutStrLn h "" | ||
hPutStrLn h "-- We could just put the raw data structure here, but if we do that, there will be lots of" | ||
hPutStrLn h "-- static string literals, which makes GHC really slow when compiling. Instead, we can manually" | ||
hPutStrLn h "-- serialize the datastructure ourself, so there's only one string literal." | ||
hPutStrLn h "" | ||
hPutStrLn h "{-|" | ||
hPutStrLn h $ "The opaque data structure that 'isSuffix' can query. This data structure was generated at " ++ show current_time | ||
hPutStrLn h "-}" | ||
hPutStrLn h "dataStructure :: DataStructure" | ||
hPutStrLn h "#if defined(RUNTIMELIST)" | ||
hPutStrLn h "{-# NOINLINE dataStructure #-}" | ||
hPutStrLn h "dataStructure = unsafePerformIO $ C.runResourceT $ sourceFile RUNTIMELIST C.$$ PSLC.sink" | ||
hPutStrLn h "#else" | ||
hPutStrLn h "dataStructure = getDataStructure serializedDataStructure" | ||
hPutStrLn h "" | ||
hPutStrLn h "serializedDataStructure :: BS.ByteString" | ||
hPutStrLn h $ "serializedDataStructure = " ++ (show $ putDataStructure ds) | ||
hPutStrLn h "" | ||
hPutStrLn h "#endif" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import Control.Monad.IO.Class (liftIO) | ||
import Data.Array ((!)) | ||
import qualified Data.ByteString as BS | ||
import qualified Data.Conduit as C | ||
import Data.Conduit.Binary (sinkHandle) | ||
import qualified Data.Conduit.List as CL | ||
import qualified Data.Conduit.Text as CT | ||
import Data.Text (pack, unpack) | ||
import Data.Time.Clock | ||
import qualified Network.HTTP.Conduit as HC | ||
import System.IO | ||
import Text.Regex.Base.RegexLike | ||
import Text.Regex.Posix.String | ||
import Data.List (stripPrefix) | ||
import Data.Maybe (fromMaybe) | ||
import Control.Monad (guard) | ||
|
||
header :: [String] | ||
header = [ "{-# LANGUAGE OverloadedStrings #-}" | ||
, "" | ||
, "import Data.Char" | ||
, "import Data.Maybe" | ||
, "import qualified Data.Text as T" | ||
, "import Debug.Trace" | ||
, "import Network.PublicSuffixList.DataStructure" | ||
, "import qualified Network.PublicSuffixList.Lookup as L" | ||
, "import Network.PublicSuffixList.Serialize" | ||
, "import System.Exit" | ||
, "import Test.HUnit" | ||
, "import Text.IDNA" | ||
, "" | ||
, "" | ||
, "effectiveTLDPlusOne' :: T.Text -> Maybe T.Text" | ||
, "effectiveTLDPlusOne' = L.effectiveTLDPlusOne . T.intercalate \".\" . map (fromJust . toASCII False True . T.map toLower) . T.split (== '.')" | ||
] | ||
|
||
header2 :: [String] | ||
header2 = [ "hunittests :: Test" | ||
, "hunittests = TestList [" | ||
] | ||
|
||
footer :: [String] | ||
footer = [ " ]" | ||
, "" | ||
, "testSerializationRoundTrip = TestCase $ assertEqual \"Round Trip\" dataStructure ds" | ||
, " where ds = getDataStructure serializedDataStructure" | ||
, " serializedDataStructure = putDataStructure dataStructure" | ||
, "" | ||
, "main = do" | ||
, " counts <- runTestTT $ TestList [TestLabel \"Mozilla Tests\" hunittests, TestLabel \"Round Trip\" testSerializationRoundTrip]" | ||
, " if errors counts == 0 && failures counts == 0" | ||
, " then exitSuccess" | ||
, " else exitFailure" | ||
] | ||
|
||
whitespace :: String -> Bool | ||
whitespace = matchTest regex | ||
where regex = makeRegex "^[[:blank:]]*$" :: Regex | ||
|
||
comment :: String -> Bool | ||
comment = matchTest regex | ||
where regex = makeRegex "^[[:blank:]]*//" :: Regex | ||
|
||
nullinput :: String -> Bool | ||
nullinput = (==) "checkPublicSuffix(null, null);" | ||
|
||
startswithdot :: String -> Bool | ||
startswithdot = matchTest regex | ||
where regex = makeRegex "^checkPublicSuffix\\('\\.(.+)', (.+)\\);$" :: Regex | ||
|
||
input :: String -> (String, Maybe String) | ||
input s = fromMaybe (error $ "input failed on " ++ show s) $ do | ||
s <- stripPrefix "checkPublicSuffix('" s | ||
(l, '\'':s) <- Just $ break (== '\'') s | ||
s <- stripPrefix ", " s | ||
(r, s) <- Just $ break (== ')') s | ||
guard $ s == ");" | ||
let m = do | ||
r <- stripPrefix "'" r | ||
r <- stripPrefix "'" $ reverse r | ||
Just $ reverse r | ||
Just (l, m) | ||
|
||
counter :: (Monad m, Num t1) => C.Conduit t m (t, t1) | ||
counter = counterHelper 0 | ||
where counterHelper count = C.await >>= \ x -> case x of | ||
Nothing -> return () | ||
Just a -> C.yield (a, count) >> counterHelper (count + 1) | ||
|
||
intersperse :: (Monad m) => a -> C.Conduit a m a | ||
intersperse i = C.await >>= \ x -> case x of | ||
Nothing -> return () | ||
Just a -> C.yield a >> intersperseHelper | ||
where intersperseHelper = C.await >>= \ x -> case x of | ||
Nothing -> return () | ||
Just a -> C.yield i >> C.yield a >> intersperseHelper | ||
|
||
output :: (Show t1, Show t2) => ((String, t1), t2) -> String | ||
output ((s, b), c) = " TestCase $ assertEqual \"" ++ (show c) ++ "\" (" ++ (show b) ++ ") $ effectiveTLDPlusOne' \"" ++ s ++ "\"" | ||
|
||
populateFile :: String -> String -> IO () | ||
populateFile url filename = withFile filename WriteMode $ \ h -> do | ||
current_time <- getCurrentTime | ||
putStrLn $ "Fetched Public Suffix List at " ++ show current_time | ||
mapM_ (hPutStrLn h) header | ||
hPutStrLn h $ "-- DO NOT MODIFY! This file has been automatically generated from the CreateTest.hs script at " ++ show current_time | ||
mapM_ (hPutStrLn h) header2 | ||
req <- HC.parseUrl url | ||
HC.withManager $ \ manager -> do | ||
res <- HC.http req manager | ||
HC.responseBody res C.$$+- | ||
CT.decode CT.utf8 C.=$ | ||
CT.lines C.=$ | ||
CL.map unpack C.=$ | ||
CL.filter (not . whitespace) C.=$ | ||
CL.filter (not . comment) C.=$ | ||
CL.filter (not . nullinput) C.=$ | ||
CL.filter (not . startswithdot) C.=$ | ||
CL.map input C.=$ | ||
counter C.=$ | ||
CL.map output C.=$ | ||
intersperse ",\n" C.=$ | ||
CL.map pack C.=$ | ||
CT.encode CT.utf8 C.=$ | ||
sinkHandle h | ||
mapM_ (hPutStrLn h) footer | ||
|
||
main :: IO () | ||
main = populateFile "http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt?raw=1" "Test/PublicSuffixList.hs" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
The following license covers this documentation, and the source code, except | ||
where otherwise indicated. | ||
|
||
Copyright 2012, Myles C. Maxfield. All rights reserved. | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions are met: | ||
|
||
* Redistributions of source code must retain the above copyright notice, this | ||
list of conditions and the following disclaimer. | ||
|
||
* Redistributions in binary form must reproduce the above copyright notice, | ||
this list of conditions and the following disclaimer in the documentation | ||
and/or other materials provided with the distribution. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS "AS IS" AND ANY EXPRESS OR | ||
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF | ||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO | ||
EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT, INDIRECT, | ||
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | ||
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, | ||
OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | ||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE | ||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF | ||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
72 changes: 72 additions & 0 deletions
72
http-client/publicsuffixlist/Network/PublicSuffixList/Create.hs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
{-# LANGUAGE DeriveDataTypeable #-} | ||
{-# LANGUAGE OverloadedStrings #-} | ||
{-# LANGUAGE NoMonomorphismRestriction #-} | ||
{-# LANGUAGE CPP #-} | ||
|
||
{-| | ||
This script parses the public suffix list, and constructs a data structure which can | ||
be used with the isSuffix function in Lookup.hs. It exports a GSink which produces | ||
the opaque 'DataStructure' and can be fed any Source as input. | ||
This makes an few assumption about the information in the public suffix list: | ||
namely, that no rule is a suffix of another rule. For example, if there is a rule | ||
abc.def.ghi | ||
then then is no other rule | ||
def.ghi | ||
or | ||
!def.ghi | ||
The actual data structure involved here is a tree where the nodes have no value and | ||
the edges are DNS labels. There are two trees: one to handle the exception rules, | ||
and one to handle the regular rules. | ||
-} | ||
|
||
module Network.PublicSuffixList.Create (PublicSuffixListException, sink) where | ||
|
||
import Control.Exception | ||
import qualified Data.ByteString as BS | ||
import qualified Data.Conduit as C | ||
import qualified Data.Conduit.List as CL | ||
import qualified Data.Conduit.Text as CT | ||
import qualified Data.Map as M | ||
import qualified Data.Text as T | ||
import Data.Typeable | ||
import Text.IDNA | ||
import Control.Monad.Catch (MonadThrow) | ||
|
||
import Network.PublicSuffixList.Types | ||
|
||
data PublicSuffixListException = PublicSuffixListException | ||
deriving (Show, Typeable) | ||
|
||
instance Exception PublicSuffixListException | ||
|
||
insert :: (Ord e) => Tree e -> [e] -> Tree e | ||
insert _ [] = def | ||
insert t (p : ps) = case M.lookup p $ children t of | ||
Nothing -> t { children = M.insert p (insert def ps) $ children t } | ||
Just l -> t { children = M.insert p (insert l ps) $ children t } | ||
|
||
foldingFunction :: DataStructure -> T.Text -> DataStructure | ||
foldingFunction d@(rules, exceptions) s' | ||
| T.null s = d | ||
| T.take 2 s == "//" = d | ||
| T.head s == '!' = (rules, insert exceptions $ labelList $ T.tail s) | ||
| otherwise = (insert rules $ labelList s, exceptions) | ||
where ss = filter (not . T.null) $ T.words s' | ||
s | ||
| null ss = "" | ||
| otherwise = head ss | ||
labelList = reverse . map internationalize . T.split (== '.') | ||
internationalize str | ||
| str == "*" = str | ||
| otherwise = case toASCII False True $ T.toLower str of | ||
Just x -> x | ||
Nothing -> throw PublicSuffixListException | ||
|
||
{- | ||
Generate the opaque 'DataStructure' | ||
-} | ||
sink :: MonadThrow m => C.Sink BS.ByteString m DataStructure | ||
sink = CT.decode CT.utf8 C.=$ CT.lines C.=$ CL.fold foldingFunction (def, def) | ||
|
38 changes: 38 additions & 0 deletions
38
http-client/publicsuffixlist/Network/PublicSuffixList/DataStructure.hs
Large diffs are not rendered by default.
Oops, something went wrong.
88 changes: 88 additions & 0 deletions
88
http-client/publicsuffixlist/Network/PublicSuffixList/Lookup.hs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
{-# LANGUAGE OverloadedStrings #-} | ||
module Network.PublicSuffixList.Lookup (effectiveTLDPlusOne, effectiveTLDPlusOne', isSuffix, isSuffix') where | ||
|
||
import qualified Data.Map as M | ||
import Data.Maybe (isNothing) | ||
import qualified Data.Text as T | ||
|
||
import qualified Network.PublicSuffixList.DataStructure as DS | ||
import Network.PublicSuffixList.Types | ||
|
||
{-| | ||
OffEnd's Bool argument represents whether we fell off a | ||
leaf or whether we fell off a non-leaf. True means that | ||
we fell off a leaf. Its Text argument is the component | ||
that pushed us off the end, along with all the components | ||
to the right of that one, interspersed with "."s | ||
-} | ||
data LookupResult = Inside | AtLeaf | OffEnd Bool T.Text | ||
deriving (Eq) | ||
|
||
{-| | ||
This function returns whether or not this domain is owned by a | ||
registrar or a regular person. 'Nothing' means that this is a registrar | ||
domain; 'Just x' means it's owned by a person. This is used to determine | ||
if a cookie is allowed to bet set for a particular domain. For | ||
example, you shouldn't be able to set a cookie for \"com\". | ||
If the value is 'Just x', then the x value is what is known as the | ||
effective TLD plus one. This is one segment more than the suffix of the | ||
domain. For example, the eTLD+1 for "this.is.a.subdom.com" is Just | ||
"subdom.com" | ||
Note that this function expects lowercase ASCII strings. These strings | ||
should be gotten from the toASCII algorithm as described in RFC 3490. | ||
These strings should not start or end with the \'.\' character, and should | ||
not have two \'.\' characters next to each other. | ||
(The toASCII algorithm is implemented in the \'idna\' hackage package, | ||
though that package doesn't always map strings to lowercase) | ||
-} | ||
effectiveTLDPlusOne' :: DataStructure -> T.Text -> Maybe T.Text | ||
effectiveTLDPlusOne' dataStructure s | ||
-- Any TLD is a suffix | ||
| length ss == 1 = Nothing | ||
| otherwise = output rulesResult exceptionResult | ||
where ss = T.splitOn "." s | ||
ps = reverse ss | ||
exceptionResult = recurse ps [] $ snd dataStructure | ||
rulesResult = recurse ps [] $ fst dataStructure | ||
-- If we fell off, did we do it at a leaf? Otherwise, what's the | ||
-- subtree that we're at | ||
getNext :: Tree T.Text -> T.Text -> Either Bool (Tree T.Text) | ||
getNext t s' = case M.lookup s' $ children t of | ||
Nothing -> Left (M.null $ children t) | ||
Just t' -> Right t' | ||
-- Look up the component we're looking for... | ||
getNextWithStar t s' = case getNext t s' of | ||
-- and if that fails, look up "*" | ||
Left _ -> getNext t "*" | ||
r -> r | ||
recurse :: [T.Text] -> [T.Text] -> Tree T.Text -> LookupResult | ||
recurse [] _ t | ||
| M.null $ children t = AtLeaf | ||
| otherwise = Inside | ||
recurse (c : cs) prev t = case getNextWithStar t c of | ||
Left b -> OffEnd b $ T.intercalate "." (c : prev) | ||
Right t' -> recurse cs (c : prev) t' | ||
-- Only match against the exception rules if we have a full match | ||
output _ AtLeaf = Just s | ||
output _ (OffEnd True x) = Just $ T.intercalate "." $ tail $ T.splitOn "." x | ||
-- If we have a subdomain on an existing rule, we're not a suffix | ||
output (OffEnd _ x) _ | ||
-- A single level domain can never be a eTLD+1 | ||
| isNothing $ T.find (== '.') x = Just $ T.intercalate "." $ drop (length ss - 2) ss | ||
| otherwise = Just x | ||
-- Otherwise, we're a suffix of a suffix, which is a suffix | ||
output _ _ = Nothing | ||
|
||
-- | >>> effectiveTLDPlusOne = effectiveTLDPlusOne' Network.PublicSuffixList.DataStructure.dataStructure | ||
effectiveTLDPlusOne :: T.Text -> Maybe T.Text | ||
effectiveTLDPlusOne = effectiveTLDPlusOne' DS.dataStructure | ||
|
||
-- | >>> isSuffix' dataStructure = isNothing . effectiveTLDPlusOne' dataStructure | ||
isSuffix' :: DataStructure -> T.Text -> Bool | ||
isSuffix' dataStructure = isNothing . effectiveTLDPlusOne' dataStructure | ||
|
||
-- | >>> isSuffix = isSuffix' Network.PublicSuffixList.DataStructure.dataStructure | ||
isSuffix :: T.Text -> Bool | ||
isSuffix = isNothing . effectiveTLDPlusOne |
Oops, something went wrong.