{-|
Module      : Gargantext.Text.Terms.Mono.Token.En
Description : Tokenizer main functions
Copyright   : (c) Grzegorz Chrupała (original author); CNRS, 2018-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Initially inspired by https://bitbucket.org/gchrupala/lingo/overview
-}


module Gargantext.Text.Terms.Mono.Token.En
  ( EitherList(..)
  , Tokenizer
  , tokenize
  , defaultTokenizer
  , whitespace
  , uris
  , punctuation
  , finalPunctuation
  , initialPunctuation
  , allPunctuation
  , contractions
  , negatives
  )
  where

import Data.Foldable (concatMap)
import qualified Data.Char as Char
import Data.Maybe
import Control.Monad
import Control.Applicative (Applicative)

import Data.Text (Text)
import qualified Data.Text as T

import Data.Either
import Gargantext.Prelude

-- | A 'Tokenizer' is a function which takes a 'Text' and returns a list of
-- Eithers (wrapped in a newtype). Right Texts will be passed on for
-- processing to tokenizers further down the pipeline. Left Texts will be
-- passed through the pipeline unchanged. Use a Left Text in a tokenizer to
-- protect certain tokens from further processing (e.g. see the 'uris'
-- tokenizer).
--
-- You can define your own custom tokenizer pipelines by chaining tokenizers
-- together:
--
-- > myTokenizer :: Tokenizer
-- > myTokenizer = whitespace >=> allPunctuation
--
-- Example inputs such pipelines should handle (a usage sketch follows the
-- list):
--
-- examples :: [Text]
-- examples =
--     ["This shouldn't happen."
--     ,"Some 'quoted' stuff"
--     ,"This is a URL: http://example.org."
--     ,"How about an email@example.com"
--     ,"ReferenceError #1065 broke my debugger!"
--     ,"I would've gone."
--     ,"They've been there."
--     ,"Hyphen-words"
--     ,"Yes/No questions"
--     ]
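--
-- A minimal usage sketch (an illustration added here, not from the original
-- source): with the default pipeline, the first example above tokenizes
-- roughly as
--
-- > tokenize "This shouldn't happen."
-- > -- ["This","should","n't","happen","."]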
type Tokenizer = Text -> EitherList Text Text

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b = E { unE :: [Either a b] }

-- | Split a string into words using the default tokenizer pipeline.
tokenize :: Text -> [Text]
tokenize = run defaultTokenizer

-- | Run a tokenizer over a 'Text', dropping the Left/Right wrappers.
-- 'T.copy' is applied so the resulting tokens are not slices retaining the
-- original (possibly large) input Text.
run :: Tokenizer -> (Text -> [Text])
run f = \txt -> map T.copy $ (map unwrap . unE . f) txt

defaultTokenizer :: Tokenizer
defaultTokenizer = whitespace
               >=> uris
               >=> punctuation
               >=> contractions
               >=> negatives

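-- A small sketch added for illustration (not part of the original module; the
-- name 'tokenizeKeepingPunct' is hypothetical): any pipeline chained with
-- '>=>' can be run the same way as 'defaultTokenizer', e.g. one that also
-- splits word-internal punctuation.
tokenizeKeepingPunct :: Text -> [Text]
tokenizeKeepingPunct = run (whitespace >=> uris >=> allPunctuation)
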
-- | Detect common URIs and freeze them
uris :: Tokenizer
uris x | isUri x   = E [Left x]
       | otherwise = E [Right x]
  where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:","https://"]
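
-- For example (an added note, not from the original source): in
-- 'defaultTokenizer' the token "http://example.org." is frozen here as a
-- Left value, so the later punctuation stages leave its trailing dot
-- attached.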

-- | Split off initial and final punctuation
punctuation :: Tokenizer
punctuation = finalPunctuation >=> initialPunctuation

--hyphens :: Tokenizer
--hyphens xs = E [Right w | w <- T.split (=='-') xs ]

-- | Split off word-final punctuation.
-- The token is reversed so that 'T.span' can peel the trailing run of
-- punctuation off the front; both pieces are then reversed back.
finalPunctuation :: Tokenizer
finalPunctuation x = E $ filter (not . T.null . unwrap) res
  where
    res :: [Either Text Text]
    res = case T.span Char.isPunctuation (T.reverse x) of
      (ps, w) | T.null ps -> [ Right $ T.reverse w ]
              | otherwise -> [ Right $ T.reverse w
                             , Right $ T.reverse ps ]
      -- ([],w) -> [Right . T.reverse $ w]
      -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
initialPunctuation x = E $ filter (not . T.null . unwrap) $
  case T.span Char.isPunctuation x of
    (ps, w) | T.null ps -> [ Right w ]
            | otherwise -> [ Right ps
                           , Right w ]

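-- Taken together (an added example, not from the original source), the
-- 'punctuation' stage splits a token such as "'quoted'" into
-- ["'", "quoted", "'"].
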
-- | Split tokens on transitions between punctuation and
-- non-punctuation characters. This tokenizer is not included in the
-- defaultTokenizer pipeline because dealing with word-internal
-- punctuation is quite application-specific.
allPunctuation :: Tokenizer
allPunctuation = E . map Right
               . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)

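-- For instance (an added example, not from the original source):
-- allPunctuation "Yes/No" yields E [Right "Yes", Right "/", Right "No"].
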
-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
                                         , Left "n't" ]
            | otherwise              = E [ Right x ]

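-- For instance (an added example, not from the original source):
-- negatives "isn't" yields E [Right "is", Left "n't"], with "n't" frozen.
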
-- | Split common contractions off and freeze them.
-- Currently deals with: 'm, 's, 'd, 've, 'll
contractions :: Tokenizer
contractions x = case catMaybes . map (splitSuffix x) $ cts of
                   []        -> return x
                   ((w,s):_) -> E [ Right w, Left s ]
  where cts = ["'m","'s","'d","'ve","'ll"]
        splitSuffix w sfx =
          let w'  = T.reverse w
              len = T.length sfx
          in if sfx `T.isSuffixOf` w
             then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
             else Nothing

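-- With the default pipeline (an added example, not from the original source):
--
-- > tokenize "I would've gone."
-- > -- ["I","would","'ve","gone","."]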

-- | Split a string on whitespace. This is just a wrapper for 'T.words'.
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- T.words xs ]

-- Left values pass through '>>=' untouched; only Right values are fed to the
-- next tokenizer in the pipeline.
instance Monad (EitherList a) where
  return x = E [Right x]
  E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs

instance Applicative (EitherList a) where
  pure x = return x
  f <*> x = f `ap` x

instance Functor (EitherList a) where
  fmap f (E xs) = E $ (fmap . fmap) f xs

-- | Extract the value from either constructor.
unwrap :: Either a a -> a
unwrap (Left x)  = x
unwrap (Right x) = x