]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Mono/Token/En.hs
[DEV] -> [STABLE]
[gargantext.git] / src / Gargantext / Text / Terms / Mono / Token / En.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Token.Text
3 Description : Tokenizer main functions
4 Copyright : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 First inspired from https://bitbucket.org/gchrupala/lingo/overview
11 -}
12
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Text.Terms.Mono.Token.En
17 ( EitherList(..)
18 , Tokenizer
19 , tokenize
20 , defaultTokenizer
21 , whitespace
22 , uris
23 , punctuation
24 , finalPunctuation
25 , initialPunctuation
26 , allPunctuation
27 , contractions
28 , negatives
29 )
30 where
31
32 import Data.Foldable (concatMap)
33 import qualified Data.Char as Char
34 import Data.Maybe
35 import Control.Monad
36 import Control.Applicative (Applicative)
37
38 import Data.Text (Text)
39 import qualified Data.Text as T
40
41 import Data.Either
42 import Gargantext.Prelude
43
44 -- | A Tokenizer is function which takes a list and returns a list of Eithers
45 -- (wrapped in a newtype). Right Texts will be passed on for processing
46 -- to tokenizers down
47 -- the pipeline. Left Texts will be passed through the pipeline unchanged.
48 -- Use a Left Texts in a tokenizer to protect certain tokens from further
49 -- processing (e.g. see the 'uris' tokenizer).
50 -- You can define your own custom tokenizer pipelines by chaining tokenizers together:
51 ---
52 -- > myTokenizer :: Tokenizer
53 -- > myTokenizer = whitespace >=> allPunctuation
54 -- examples :: [Text]
55 -- examples =
56 -- ["This shouldn't happen."
57 -- ,"Some 'quoted' stuff"
58 -- ,"This is a URL: http://example.org."
59 -- ,"How about an email@example.com"
60 -- ,"ReferenceError #1065 broke my debugger!"
61 -- ,"I would've gone."
62 -- ,"They've been there."
63 -- ,"Hyphen-words"
64 -- ,"Yes/No questions"
65 -- ]
66 type Tokenizer = Text -> EitherList Text Text
67
68 -- | The EitherList is a newtype-wrapped list of Eithers.
69 newtype EitherList a b = E { unE :: [Either a b] }
70
71 -- | Split string into words using the default tokenizer pipeline
72 tokenize :: Text -> [Text]
73 tokenize = run defaultTokenizer
74
75 -- | Run a tokenizer
76 run :: Tokenizer -> (Text -> [Text])
77 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
78
79 defaultTokenizer :: Tokenizer
80 defaultTokenizer = whitespace
81 >=> uris
82 >=> punctuation
83 >=> contractions
84 >=> negatives
85
86 -- | Detect common uris and freeze them
87 uris :: Tokenizer
88 uris x | isUri x = E [Left x]
89 | True = E [Right x]
90 where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:","https://"]
91
92 -- | Split off initial and final punctuation
93 punctuation :: Tokenizer
94 punctuation = finalPunctuation >=> initialPunctuation
95
96 --hyphens :: Tokenizer
97 --hyphens xs = E [Right w | w <- T.split (=='-') xs ]
98
99 -- | Split off word-final punctuation
100 finalPunctuation :: Tokenizer
101 finalPunctuation x = E $ filter (not . T.null . unwrap) res
102 where
103 res :: [Either Text Text]
104 res = case T.span Char.isPunctuation (T.reverse x) of
105 (ps, w) | T.null ps -> [ Right $ T.reverse w ]
106 | otherwise -> [ Right $ T.reverse w
107 , Right $ T.reverse ps]
108 -- ([],w) -> [Right . T.reverse $ w]
109 -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
110
111 -- | Split off word-initial punctuation
112 initialPunctuation :: Tokenizer
113 initialPunctuation x = E $ filter (not . T.null . unwrap) $
114 case T.span Char.isPunctuation x of
115 (ps,w) | T.null ps -> [ Right w ]
116 | otherwise -> [ Right ps
117 , Right w ]
118
119 -- | Split tokens on transitions between punctuation and
120 -- non-punctuation characters. This tokenizer is not included in
121 -- defaultTokenizer pipeline because dealing with word-internal
122 -- punctuation is quite application specific.
123 allPunctuation :: Tokenizer
124 allPunctuation = E . map Right
125 . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
126
127 -- | Split words ending in n't, and freeze n't
128 negatives :: Tokenizer
129 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
130 , Left "n't" ]
131 | True = E [ Right x ]
132
133 -- | Split common contractions off and freeze them.
134 -- | Currently deals with: 'm, 's, 'd, 've, 'll
135 contractions :: Tokenizer
136 contractions x = case catMaybes . map (splitSuffix x) $ cts of
137 [] -> return x
138 ((w,s):_) -> E [ Right w,Left s]
139 where cts = ["'m","'s","'d","'ve","'ll"]
140 splitSuffix w sfx =
141 let w' = T.reverse w
142 len = T.length sfx
143 in if sfx `T.isSuffixOf` w
144 then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
145 else Nothing
146
147
148 -- | Split string on whitespace. This is just a wrapper for Data.List.words
149 whitespace :: Tokenizer
150 whitespace xs = E [Right w | w <- T.words xs ]
151
152 instance Monad (EitherList a) where
153 return x = E [Right x]
154 E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
155
156 instance Applicative (EitherList a) where
157 pure x = return x
158 f <*> x = f `ap` x
159
160 instance Functor (EitherList a) where
161 fmap f (E xs) = E $ (fmap . fmap) f xs
162
163 unwrap :: Either a a -> a
164 unwrap (Left x) = x
165 unwrap (Right x) = x
166