2 Module : Gargantext.Text.Ngrams.Token.Text
4 Copyright : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 First inspired from https://bitbucket.org/gchrupala/lingo/overview
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Text.Terms.Mono.Token.En
33 import Data.Foldable (concatMap)
34 import qualified Data.Char as Char
37 import Control.Applicative (Applicative)
39 import Data.Text (Text)
40 import qualified Data.Text as T
43 import Gargantext.Prelude
45 -- | A Tokenizer is function which takes a list and returns a list of Eithers
46 -- (wrapped in a newtype). Right Texts will be passed on for processing
48 -- the pipeline. Left Texts will be passed through the pipeline unchanged.
49 -- Use a Left Texts in a tokenizer to protect certain tokens from further
50 -- processing (e.g. see the 'uris' tokenizer).
51 -- You can define your own custom tokenizer pipelines by chaining tokenizers together:
53 -- > myTokenizer :: Tokenizer
54 -- > myTokenizer = whitespace >=> allPunctuation
57 -- ["This shouldn't happen."
58 -- ,"Some 'quoted' stuff"
59 -- ,"This is a URL: http://example.org."
60 -- ,"How about an email@example.com"
61 -- ,"ReferenceError #1065 broke my debugger!"
62 -- ,"I would've gone."
63 -- ,"They've been there."
65 -- ,"Yes/No questions"
69 type Tokenizer = Text -> EitherList Text Text
71 -- | The EitherList is a newtype-wrapped list of Eithers.
72 newtype EitherList a b = E { unE :: [Either a b] }
74 -- | Split string into words using the default tokenizer pipeline
75 tokenize :: Text -> [Text]
76 tokenize = run defaultTokenizer
79 run :: Tokenizer -> (Text -> [Text])
80 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
82 defaultTokenizer :: Tokenizer
83 defaultTokenizer = whitespace
89 -- | Detect common uris and freeze them
91 uris x | isUri x = E [Left x]
93 where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
95 -- | Split off initial and final punctuation
96 punctuation :: Tokenizer
97 punctuation = finalPunctuation >=> initialPunctuation
99 --hyphens :: Tokenizer
100 --hyphens xs = E [Right w | w <- T.split (=='-') xs ]
102 -- | Split off word-final punctuation
103 finalPunctuation :: Tokenizer
104 finalPunctuation x = E $ filter (not . T.null . unwrap) res
106 res :: [Either Text Text]
107 res = case T.span Char.isPunctuation (T.reverse x) of
108 (ps, w) | T.null ps -> [ Right $ T.reverse w ]
109 | otherwise -> [ Right $ T.reverse w
110 , Right $ T.reverse ps]
111 -- ([],w) -> [Right . T.reverse $ w]
112 -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
114 -- | Split off word-initial punctuation
115 initialPunctuation :: Tokenizer
116 initialPunctuation x = E $ filter (not . T.null . unwrap) $
117 case T.span Char.isPunctuation x of
118 (ps,w) | T.null ps -> [ Right w ]
119 | otherwise -> [ Right ps
122 -- | Split tokens on transitions between punctuation and
123 -- non-punctuation characters. This tokenizer is not included in
124 -- defaultTokenizer pipeline because dealing with word-internal
125 -- punctuation is quite application specific.
126 allPunctuation :: Tokenizer
127 allPunctuation = E . map Right
128 . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
130 -- | Split words ending in n't, and freeze n't
131 negatives :: Tokenizer
132 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
134 | True = E [ Right x ]
136 -- | Split common contractions off and freeze them.
137 -- | Currently deals with: 'm, 's, 'd, 've, 'll
138 contractions :: Tokenizer
139 contractions x = case catMaybes . map (splitSuffix x) $ cts of
141 ((w,s):_) -> E [ Right w,Left s]
142 where cts = ["'m","'s","'d","'ve","'ll"]
146 in if sfx `T.isSuffixOf` w
147 then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
151 -- | Split string on whitespace. This is just a wrapper for Data.List.words
152 whitespace :: Tokenizer
153 whitespace xs = E [Right w | w <- T.words xs ]
155 instance Monad (EitherList a) where
156 return x = E [Right x]
157 E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
159 instance Applicative (EitherList a) where
163 instance Functor (EitherList a) where
164 fmap f (E xs) = E $ (fmap . fmap) f xs
166 unwrap :: Either a a -> a