2 Module : Gargantext.Core.Text.Ngrams.Token.Text
3 Description : Tokenizer main functions
4 Copyright : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 First inspired from https://bitbucket.org/gchrupala/lingo/overview
14 module Gargantext.Core.Text.Terms.Mono.Token.En
30 import qualified Data.Char as Char
33 import Control.Applicative (Applicative)
35 import Data.Text (Text)
36 import qualified Data.Text as T
39 import Gargantext.Prelude
41 -- | A Tokenizer is function which takes a list and returns a list of Eithers
42 -- (wrapped in a newtype). Right Texts will be passed on for processing
44 -- the pipeline. Left Texts will be passed through the pipeline unchanged.
45 -- Use a Left Texts in a tokenizer to protect certain tokens from further
46 -- processing (e.g. see the 'uris' tokenizer).
47 -- You can define your own custom tokenizer pipelines by chaining tokenizers together:
49 -- > myTokenizer :: Tokenizer
50 -- > myTokenizer = whitespace >=> allPunctuation
53 -- ["This shouldn't happen."
54 -- ,"Some 'quoted' stuff"
55 -- ,"This is a URL: http://example.org."
56 -- ,"How about an email@example.com"
57 -- ,"ReferenceError #1065 broke my debugger!"
58 -- ,"I would've gone."
59 -- ,"They've been there."
61 -- ,"Yes/No questions"
63 type Tokenizer = Text -> EitherList Text Text
65 -- | The EitherList is a newtype-wrapped list of Eithers.
66 newtype EitherList a b = E { unE :: [Either a b] }
68 -- | Split string into words using the default tokenizer pipeline
69 tokenize :: Text -> [Text]
70 tokenize = run defaultTokenizer
73 run :: Tokenizer -> (Text -> [Text])
74 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
76 defaultTokenizer :: Tokenizer
77 defaultTokenizer = whitespace
83 -- | Detect common uris and freeze them
85 uris x | isUri x = E [Left x]
87 where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:","https://"]
89 -- | Split off initial and final punctuation
90 punctuation :: Tokenizer
91 punctuation = finalPunctuation >=> initialPunctuation
93 --hyphens :: Tokenizer
94 --hyphens xs = E [Right w | w <- T.split (=='-') xs ]
96 -- | Split off word-final punctuation
97 finalPunctuation :: Tokenizer
98 finalPunctuation x = E $ filter (not . T.null . unwrap) res
100 res :: [Either Text Text]
101 res = case T.span Char.isPunctuation (T.reverse x) of
102 (ps, w) | T.null ps -> [ Right $ T.reverse w ]
103 | otherwise -> [ Right $ T.reverse w
104 , Right $ T.reverse ps]
105 -- ([],w) -> [Right . T.reverse $ w]
106 -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
108 -- | Split off word-initial punctuation
109 initialPunctuation :: Tokenizer
110 initialPunctuation x = E $ filter (not . T.null . unwrap) $
111 case T.span Char.isPunctuation x of
112 (ps,w) | T.null ps -> [ Right w ]
113 | otherwise -> [ Right ps
116 -- | Split tokens on transitions between punctuation and
117 -- non-punctuation characters. This tokenizer is not included in
118 -- defaultTokenizer pipeline because dealing with word-internal
119 -- punctuation is quite application specific.
120 allPunctuation :: Tokenizer
121 allPunctuation = E . map Right
122 . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
124 -- | Split words ending in n't, and freeze n't
125 negatives :: Tokenizer
126 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
128 | True = E [ Right x ]
130 -- | Split common contractions off and freeze them.
131 -- | Currently deals with: 'm, 's, 'd, 've, 'll
132 contractions :: Tokenizer
133 contractions x = case catMaybes . map (splitSuffix x) $ cts of
135 ((w,s):_) -> E [ Right w,Left s]
136 where cts = ["'m","'s","'d","'ve","'ll"]
140 in if sfx `T.isSuffixOf` w
141 then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
145 -- | Split string on whitespace. This is just a wrapper for Data.List.words
146 whitespace :: Tokenizer
147 whitespace xs = E [Right w | w <- T.words xs ]
149 instance Monad (EitherList a) where
150 return x = E [Right x]
151 E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
153 instance Applicative (EitherList a) where
157 instance Functor (EitherList a) where
158 fmap f (E xs) = E $ (fmap . fmap) f xs
160 unwrap :: Either a a -> a