{-|
Module      : Gargantext.Core.Text.Terms.Mono.Token.En
Description : Tokenizer main functions
Copyright   : (c) Grzegorz Chrupała (original author); CNRS, 2018-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Initially inspired by https://bitbucket.org/gchrupala/lingo/overview
-}


module Gargantext.Core.Text.Terms.Mono.Token.En
  ( EitherList(..)
  , Tokenizer
  , tokenize
  , defaultTokenizer
  , whitespace
  , uris
  , punctuation
  , finalPunctuation
  , initialPunctuation
  , allPunctuation
  , contractions
  , negatives
  )
  where

import qualified Data.Char as Char
import Data.Maybe
import Control.Monad
import Control.Applicative (Applicative)

import Data.Text (Text)
import qualified Data.Text as T

import Data.Either
import Gargantext.Prelude

-- | A Tokenizer is a function that takes a 'Text' and returns a list of
-- Eithers (wrapped in a newtype). Right Texts are passed on for further
-- processing by tokenizers down the pipeline; Left Texts pass through the
-- pipeline unchanged. Use Left in a tokenizer to protect certain tokens
-- from further processing (e.g. see the 'uris' tokenizer).
-- You can define your own tokenizer pipelines by chaining tokenizers together:
--
-- > myTokenizer :: Tokenizer
-- > myTokenizer = whitespace >=> allPunctuation
-- examples :: [Text]
-- examples =
--   ["This shouldn't happen."
--   ,"Some 'quoted' stuff"
--   ,"This is a URL: http://example.org."
--   ,"How about an email@example.com"
--   ,"ReferenceError #1065 broke my debugger!"
--   ,"I would've gone."
--   ,"They've been there."
--   ,"Hyphen-words"
--   ,"Yes/No questions"
--   ]
type Tokenizer = Text -> EitherList Text Text

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b = E { unE :: [Either a b] }

-- | Split a string into words using the default tokenizer pipeline
tokenize :: Text -> [Text]
tokenize = run defaultTokenizer

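-- A hand-traced sketch of the default pipeline (indicative, not a verified
-- doctest): the URI is frozen before punctuation splitting, so its trailing
-- dot stays attached, while "shouldn't" is split by 'negatives'.
--
-- > tokenize "This shouldn't happen: http://example.org."
-- >   == ["This","should","n't","happen",":","http://example.org."]
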
-- | Run a tokenizer. 'T.copy' is applied to each token so the results do
-- not keep the whole input buffer alive.
run :: Tokenizer -> (Text -> [Text])
run f = \txt -> map T.copy $ (map unwrap . unE . f) txt

-- | The default tokenizer pipeline.
defaultTokenizer :: Tokenizer
defaultTokenizer = whitespace
               >=> uris
               >=> punctuation
               >=> contractions
               >=> negatives

-- | Detect common URIs and freeze them
uris :: Tokenizer
uris x | isUri x   = E [Left  x]
       | otherwise = E [Right x]
  where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:","https://"]

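-- An illustrative, hand-traced note (not a verified doctest): because 'uris'
-- wraps the token in Left, the later punctuation stages leave it alone, so
-- the final "." is not split off the URL.
--
-- > run (uris >=> punctuation) "http://example.org." == ["http://example.org."]
-- > run punctuation            "http://example.org." == ["http://example.org","."]
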
-- | Split off initial and final punctuation
punctuation :: Tokenizer
punctuation = finalPunctuation >=> initialPunctuation

--hyphens :: Tokenizer
--hyphens xs = E [Right w | w <- T.split (=='-') xs ]

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer
finalPunctuation x = E $ filter (not . T.null . unwrap) res
  where
    res :: [Either Text Text]
    res = case T.span Char.isPunctuation (T.reverse x) of
      (ps, w) | T.null ps -> [ Right $ T.reverse w ]
              | otherwise -> [ Right $ T.reverse w
                             , Right $ T.reverse ps ]
      -- ([],w) -> [Right . T.reverse $ w]
      -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]

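-- Hand-traced example (indicative): the word is reversed, the leading run of
-- punctuation of the reversed text is split off, and both parts are reversed
-- back.
--
-- > run finalPunctuation "debugger!" == ["debugger","!"]
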
-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
initialPunctuation x = E $ filter (not . T.null . unwrap) $
  case T.span Char.isPunctuation x of
    (ps, w) | T.null ps -> [ Right w ]
            | otherwise -> [ Right ps
                           , Right w ]

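-- Hand-traced example (indicative):
--
-- > run initialPunctuation "'quoted" == ["'","quoted"]
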
-- | Split tokens on transitions between punctuation and
-- non-punctuation characters. This tokenizer is not included in the
-- 'defaultTokenizer' pipeline because dealing with word-internal
-- punctuation is quite application specific.
allPunctuation :: Tokenizer
allPunctuation = E . map Right
               . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)

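-- Hand-traced example (indicative): adjacent characters are grouped by
-- whether they are punctuation, so "Yes/No" becomes three tokens.
--
-- > run allPunctuation "Yes/No" == ["Yes","/","No"]
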
-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
negatives x | "n't" `T.isSuffixOf` x = E [ Right $ T.dropEnd 3 x
                                         , Left "n't" ]
            | otherwise              = E [ Right x ]

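-- Hand-traced example (indicative); the "n't" part is frozen (Left), so any
-- later stage would leave it untouched.
--
-- > run negatives "don't" == ["do","n't"]
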
-- | Split common contractions off and freeze them.
-- Currently deals with: 'm, 's, 'd, 've, 'll
contractions :: Tokenizer
contractions x = case mapMaybe (splitSuffix x) cts of
    []        -> return x
    ((w,s):_) -> E [ Right w, Left s ]
  where
    cts = ["'m","'s","'d","'ve","'ll"]
    -- Split w into (stem, suffix) when sfx is a suffix of w.
    splitSuffix w sfx
      | sfx `T.isSuffixOf` w = Just (T.dropEnd len w, T.takeEnd len w)
      | otherwise            = Nothing
      where len = T.length sfx
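
-- Hand-traced example (indicative): the first matching suffix wins and is
-- frozen (Left) so later stages skip it.
--
-- > run contractions "they've" == ["they","'ve"]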

-- | Split string on whitespace. This is just a wrapper for 'T.words'.
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- T.words xs ]

instance Monad (EitherList a) where
  E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs

instance Applicative (EitherList a) where
  pure x  = E [Right x]
  f <*> x = f `ap` x

instance Functor (EitherList a) where
  fmap f (E xs) = E $ (fmap . fmap) f xs

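-- How the Monad instance threads tokens (hand-traced, indicative): bind maps
-- each Right through the next tokenizer and passes each Left along unchanged,
-- which is what lets frozen tokens skip the rest of the pipeline.
--
-- > unE (E [Left "n't", Right "do not"] >>= whitespace)
-- >   == [Left "n't", Right "do", Right "not"]
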
-- | Extract the payload from either side.
unwrap :: Either a a -> a
unwrap (Left  x) = x
unwrap (Right x) = x