2 Module : Gargantext.Text.Ngrams.Token.Text
4 Copyright : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Inspired from https://bitbucket.org/gchrupala/lingo/overview
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Text.Ngrams.Token.Text
33 import qualified Data.Char as Char
37 import Data.Text (Text)
38 import qualified Data.Text as T
40 -- | A Tokenizer is function which takes a list and returns a list of Eithers
41 -- (wrapped in a newtype). Right Texts will be passed on for processing
43 -- the pipeline. Left Texts will be passed through the pipeline unchanged.
44 -- Use a Left Texts in a tokenizer to protect certain tokens from further
45 -- processing (e.g. see the 'uris' tokenizer).
46 -- You can define your own custom tokenizer pipelines by chaining tokenizers together:
48 -- > myTokenizer :: Tokenizer
49 -- > myTokenizer = whitespace >=> allPunctuation
52 -- ["This shouldn't happen."
53 -- ,"Some 'quoted' stuff"
54 -- ,"This is a URL: http://example.org."
55 -- ,"How about an email@example.com"
56 -- ,"ReferenceError #1065 broke my debugger!"
57 -- ,"I would've gone."
58 -- ,"They've been there."
60 -- ,"Yes/No questions"
64 type Tokenizer = Text -> EitherList Text Text
66 -- | The EitherList is a newtype-wrapped list of Eithers.
67 newtype EitherList a b = E { unE :: [Either a b] }
69 -- | Split string into words using the default tokenizer pipeline
70 tokenize :: Text -> [Text]
71 tokenize = run defaultTokenizer
74 run :: Tokenizer -> (Text -> [Text])
75 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
77 defaultTokenizer :: Tokenizer
78 defaultTokenizer = whitespace
84 -- | Detect common uris and freeze them
86 uris x | isUri x = E [Left x]
88 where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
90 -- | Split off initial and final punctuation
91 punctuation :: Tokenizer
92 punctuation = finalPunctuation >=> initialPunctuation
94 --hyphens :: Tokenizer
95 --hyphens xs = E [Right w | w <- T.split (=='-') xs ]
97 -- | Split off word-final punctuation
98 finalPunctuation :: Tokenizer
99 finalPunctuation x = E $ filter (not . T.null . unwrap) res
101 res :: [Either Text Text]
102 res = case T.span Char.isPunctuation (T.reverse x) of
103 (ps, w) | T.null ps -> [ Right $ T.reverse w ]
104 | otherwise -> [ Right $ T.reverse w
105 , Right $ T.reverse ps]
106 -- ([],w) -> [Right . T.reverse $ w]
107 -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
109 -- | Split off word-initial punctuation
110 initialPunctuation :: Tokenizer
111 initialPunctuation x = E $ filter (not . T.null . unwrap) $
112 case T.span Char.isPunctuation x of
113 (ps,w) | T.null ps -> [ Right w ]
114 | otherwise -> [ Right ps
117 -- | Split tokens on transitions between punctuation and
118 -- non-punctuation characters. This tokenizer is not included in
119 -- defaultTokenizer pipeline because dealing with word-internal
120 -- punctuation is quite application specific.
121 allPunctuation :: Tokenizer
122 allPunctuation = E . map Right
123 . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
125 -- | Split words ending in n't, and freeze n't
126 negatives :: Tokenizer
127 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
129 | True = E [ Right x ]
131 -- | Split common contractions off and freeze them.
132 -- | Currently deals with: 'm, 's, 'd, 've, 'll
133 contractions :: Tokenizer
134 contractions x = case catMaybes . map (splitSuffix x) $ cts of
136 ((w,s):_) -> E [ Right w,Left s]
137 where cts = ["'m","'s","'d","'ve","'ll"]
141 in if sfx `T.isSuffixOf` w
142 then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
146 -- | Split string on whitespace. This is just a wrapper for Data.List.words
147 whitespace :: Tokenizer
148 whitespace xs = E [Right w | w <- T.words xs ]
150 instance Monad (EitherList a) where
151 return x = E [Right x]
152 E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
154 instance Applicative (EitherList a) where
158 instance Functor (EitherList a) where
159 fmap f (E xs) = E $ (fmap . fmap) f xs
161 unwrap :: Either a a -> a