2 Module : Gargantext.Text.Ngrams.Token.Text
3 Description : Tokenizer main functions
4 Copyright : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 First inspired from https://bitbucket.org/gchrupala/lingo/overview
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Text.Terms.Mono.Token.En
32 import Data.Foldable (concatMap)
33 import qualified Data.Char as Char
36 import Control.Applicative (Applicative)
38 import Data.Text (Text)
39 import qualified Data.Text as T
42 import Gargantext.Prelude
44 -- | A Tokenizer is function which takes a list and returns a list of Eithers
45 -- (wrapped in a newtype). Right Texts will be passed on for processing
47 -- the pipeline. Left Texts will be passed through the pipeline unchanged.
48 -- Use a Left Texts in a tokenizer to protect certain tokens from further
49 -- processing (e.g. see the 'uris' tokenizer).
50 -- You can define your own custom tokenizer pipelines by chaining tokenizers together:
52 -- > myTokenizer :: Tokenizer
53 -- > myTokenizer = whitespace >=> allPunctuation
56 -- ["This shouldn't happen."
57 -- ,"Some 'quoted' stuff"
58 -- ,"This is a URL: http://example.org."
59 -- ,"How about an email@example.com"
60 -- ,"ReferenceError #1065 broke my debugger!"
61 -- ,"I would've gone."
62 -- ,"They've been there."
64 -- ,"Yes/No questions"
66 type Tokenizer = Text -> EitherList Text Text
68 -- | The EitherList is a newtype-wrapped list of Eithers.
69 newtype EitherList a b = E { unE :: [Either a b] }
71 -- | Split string into words using the default tokenizer pipeline
72 tokenize :: Text -> [Text]
73 tokenize = run defaultTokenizer
76 run :: Tokenizer -> (Text -> [Text])
77 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
79 defaultTokenizer :: Tokenizer
80 defaultTokenizer = whitespace
86 -- | Detect common uris and freeze them
88 uris x | isUri x = E [Left x]
90 where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:","https://"]
92 -- | Split off initial and final punctuation
93 punctuation :: Tokenizer
94 punctuation = finalPunctuation >=> initialPunctuation
96 --hyphens :: Tokenizer
97 --hyphens xs = E [Right w | w <- T.split (=='-') xs ]
99 -- | Split off word-final punctuation
100 finalPunctuation :: Tokenizer
101 finalPunctuation x = E $ filter (not . T.null . unwrap) res
103 res :: [Either Text Text]
104 res = case T.span Char.isPunctuation (T.reverse x) of
105 (ps, w) | T.null ps -> [ Right $ T.reverse w ]
106 | otherwise -> [ Right $ T.reverse w
107 , Right $ T.reverse ps]
108 -- ([],w) -> [Right . T.reverse $ w]
109 -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
111 -- | Split off word-initial punctuation
112 initialPunctuation :: Tokenizer
113 initialPunctuation x = E $ filter (not . T.null . unwrap) $
114 case T.span Char.isPunctuation x of
115 (ps,w) | T.null ps -> [ Right w ]
116 | otherwise -> [ Right ps
119 -- | Split tokens on transitions between punctuation and
120 -- non-punctuation characters. This tokenizer is not included in
121 -- defaultTokenizer pipeline because dealing with word-internal
122 -- punctuation is quite application specific.
123 allPunctuation :: Tokenizer
124 allPunctuation = E . map Right
125 . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
127 -- | Split words ending in n't, and freeze n't
128 negatives :: Tokenizer
129 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
131 | True = E [ Right x ]
133 -- | Split common contractions off and freeze them.
134 -- | Currently deals with: 'm, 's, 'd, 've, 'll
135 contractions :: Tokenizer
136 contractions x = case catMaybes . map (splitSuffix x) $ cts of
138 ((w,s):_) -> E [ Right w,Left s]
139 where cts = ["'m","'s","'d","'ve","'ll"]
143 in if sfx `T.isSuffixOf` w
144 then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
148 -- | Split string on whitespace. This is just a wrapper for Data.List.words
149 whitespace :: Tokenizer
150 whitespace xs = E [Right w | w <- T.words xs ]
152 instance Monad (EitherList a) where
153 return x = E [Right x]
154 E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
156 instance Applicative (EitherList a) where
160 instance Functor (EitherList a) where
161 fmap f (E xs) = E $ (fmap . fmap) f xs
163 unwrap :: Either a a -> a