2 Module : Gargantext.Ngrams.Token.Text
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Inspired from https://bitbucket.org/gchrupala/lingo/overview
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Ngrams.Token.Text
33 import qualified Data.Char as Char
35 import Control.Monad.Instances ()
36 import Control.Applicative
39 import Data.Text (Text)
40 import qualified Data.Text as T
42 -- | A Tokenizer is function which takes a list and returns a list of Eithers
43 -- (wrapped in a newtype). Right Texts will be passed on for processing
45 -- the pipeline. Left Texts will be passed through the pipeline unchanged.
46 -- Use a Left Texts in a tokenizer to protect certain tokens from further
47 -- processing (e.g. see the 'uris' tokenizer).
48 -- You can define your own custom tokenizer pipelines by chaining tokenizers together:
50 -- > myTokenizer :: Tokenizer
51 -- > myTokenizer = whitespace >=> allPunctuation
54 type Tokenizer = Text -> EitherList Text Text
56 -- | The EitherList is a newtype-wrapped list of Eithers.
57 newtype EitherList a b = E { unE :: [Either a b] }
59 -- | Split string into words using the default tokenizer pipeline
60 tokenize :: Text -> [Text]
61 tokenize = run defaultTokenizer
64 run :: Tokenizer -> (Text -> [Text])
65 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
67 defaultTokenizer :: Tokenizer
68 defaultTokenizer = whitespace
74 -- | Detect common uris and freeze them
76 uris x | isUri x = E [Left x]
78 where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
80 -- | Split off initial and final punctuation
81 punctuation :: Tokenizer
82 punctuation = finalPunctuation >=> initialPunctuation
85 hyphens xs = E [Right w | w <- T.split (=='-') xs ]
87 -- | Split off word-final punctuation
88 finalPunctuation :: Tokenizer
89 finalPunctuation x = E $ filter (not . T.null . unwrap) res
91 res :: [Either Text Text]
92 res = case T.span Char.isPunctuation (T.reverse x) of
93 (ps, w) | T.null ps -> [ Right $ T.reverse w ]
94 | otherwise -> [ Right $ T.reverse w
95 , Right $ T.reverse ps]
96 -- ([],w) -> [Right . T.reverse $ w]
97 -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
99 -- | Split off word-initial punctuation
100 initialPunctuation :: Tokenizer
101 initialPunctuation x = E $ filter (not . T.null . unwrap) $
102 case T.span Char.isPunctuation x of
103 (ps,w) | T.null ps -> [ Right w ]
104 | otherwise -> [ Right ps
107 -- | Split tokens on transitions between punctuation and
108 -- non-punctuation characters. This tokenizer is not included in
109 -- defaultTokenizer pipeline because dealing with word-internal
110 -- punctuation is quite application specific.
111 allPunctuation :: Tokenizer
112 allPunctuation = E . map Right
113 . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
115 -- | Split words ending in n't, and freeze n't
116 negatives :: Tokenizer
117 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
119 | True = E [ Right x ]
121 -- | Split common contractions off and freeze them.
122 -- | Currently deals with: 'm, 's, 'd, 've, 'll
123 contractions :: Tokenizer
124 contractions x = case catMaybes . map (splitSuffix x) $ cts of
126 ((w,s):_) -> E [ Right w,Left s]
127 where cts = ["'m","'s","'d","'ve","'ll"]
131 in if sfx `T.isSuffixOf` w
132 then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
136 -- | Split string on whitespace. This is just a wrapper for Data.List.words
137 whitespace :: Tokenizer
138 whitespace xs = E [Right w | w <- T.words xs ]
140 instance Monad (EitherList a) where
141 return x = E [Right x]
142 E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
144 instance Applicative (EitherList a) where
148 instance Functor (EitherList a) where
149 fmap f (E xs) = E $ (fmap . fmap) f xs
151 unwrap :: Either a a -> a
157 ["This shouldn't happen."
158 ,"Some 'quoted' stuff"
159 ,"This is a URL: http://example.org."
160 ,"How about an email@example.com"
161 ,"ReferenceError #1065 broke my debugger!"
163 ,"They've been there."