]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Mono/Token/En.hs
[MERGE] distances for Graph.
[gargantext.git] / src / Gargantext / Text / Terms / Mono / Token / En.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Token.Text
3 Description :
4 Copyright : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 First inspired from https://bitbucket.org/gchrupala/lingo/overview
11 -}
12
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Text.Terms.Mono.Token.En
17 ( EitherList(..)
18 , Tokenizer
19 , tokenize
20 , run
21 , defaultTokenizer
22 , whitespace
23 , uris
24 , punctuation
25 , finalPunctuation
26 , initialPunctuation
27 , allPunctuation
28 , contractions
29 , negatives
30 )
31 where
32
33 import Data.Foldable (concatMap)
34 import qualified Data.Char as Char
35 import Data.Maybe
36 import Control.Monad
37 import Control.Applicative (Applicative)
38
39 import Data.Text (Text)
40 import qualified Data.Text as T
41
42 import Data.Either
43 import Gargantext.Prelude
44
45 -- | A Tokenizer is function which takes a list and returns a list of Eithers
46 -- (wrapped in a newtype). Right Texts will be passed on for processing
47 -- to tokenizers down
48 -- the pipeline. Left Texts will be passed through the pipeline unchanged.
49 -- Use a Left Texts in a tokenizer to protect certain tokens from further
50 -- processing (e.g. see the 'uris' tokenizer).
51 -- You can define your own custom tokenizer pipelines by chaining tokenizers together:
52 ---
53 -- > myTokenizer :: Tokenizer
54 -- > myTokenizer = whitespace >=> allPunctuation
55 -- examples :: [Text]
56 -- examples =
57 -- ["This shouldn't happen."
58 -- ,"Some 'quoted' stuff"
59 -- ,"This is a URL: http://example.org."
60 -- ,"How about an email@example.com"
61 -- ,"ReferenceError #1065 broke my debugger!"
62 -- ,"I would've gone."
63 -- ,"They've been there."
64 -- ,"Hyphen-words"
65 -- ,"Yes/No questions"
66 -- ]
67 ---
68
69 type Tokenizer = Text -> EitherList Text Text
70
71 -- | The EitherList is a newtype-wrapped list of Eithers.
72 newtype EitherList a b = E { unE :: [Either a b] }
73
74 -- | Split string into words using the default tokenizer pipeline
75 tokenize :: Text -> [Text]
76 tokenize = run defaultTokenizer
77
78 -- | Run a tokenizer
79 run :: Tokenizer -> (Text -> [Text])
80 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
81
82 defaultTokenizer :: Tokenizer
83 defaultTokenizer = whitespace
84 >=> uris
85 >=> punctuation
86 >=> contractions
87 >=> negatives
88
89 -- | Detect common uris and freeze them
90 uris :: Tokenizer
91 uris x | isUri x = E [Left x]
92 | True = E [Right x]
93 where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
94
95 -- | Split off initial and final punctuation
96 punctuation :: Tokenizer
97 punctuation = finalPunctuation >=> initialPunctuation
98
99 --hyphens :: Tokenizer
100 --hyphens xs = E [Right w | w <- T.split (=='-') xs ]
101
102 -- | Split off word-final punctuation
103 finalPunctuation :: Tokenizer
104 finalPunctuation x = E $ filter (not . T.null . unwrap) res
105 where
106 res :: [Either Text Text]
107 res = case T.span Char.isPunctuation (T.reverse x) of
108 (ps, w) | T.null ps -> [ Right $ T.reverse w ]
109 | otherwise -> [ Right $ T.reverse w
110 , Right $ T.reverse ps]
111 -- ([],w) -> [Right . T.reverse $ w]
112 -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
113
114 -- | Split off word-initial punctuation
115 initialPunctuation :: Tokenizer
116 initialPunctuation x = E $ filter (not . T.null . unwrap) $
117 case T.span Char.isPunctuation x of
118 (ps,w) | T.null ps -> [ Right w ]
119 | otherwise -> [ Right ps
120 , Right w ]
121
122 -- | Split tokens on transitions between punctuation and
123 -- non-punctuation characters. This tokenizer is not included in
124 -- defaultTokenizer pipeline because dealing with word-internal
125 -- punctuation is quite application specific.
126 allPunctuation :: Tokenizer
127 allPunctuation = E . map Right
128 . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
129
130 -- | Split words ending in n't, and freeze n't
131 negatives :: Tokenizer
132 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
133 , Left "n't" ]
134 | True = E [ Right x ]
135
136 -- | Split common contractions off and freeze them.
137 -- | Currently deals with: 'm, 's, 'd, 've, 'll
138 contractions :: Tokenizer
139 contractions x = case catMaybes . map (splitSuffix x) $ cts of
140 [] -> return x
141 ((w,s):_) -> E [ Right w,Left s]
142 where cts = ["'m","'s","'d","'ve","'ll"]
143 splitSuffix w sfx =
144 let w' = T.reverse w
145 len = T.length sfx
146 in if sfx `T.isSuffixOf` w
147 then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
148 else Nothing
149
150
151 -- | Split string on whitespace. This is just a wrapper for Data.List.words
152 whitespace :: Tokenizer
153 whitespace xs = E [Right w | w <- T.words xs ]
154
155 instance Monad (EitherList a) where
156 return x = E [Right x]
157 E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
158
159 instance Applicative (EitherList a) where
160 pure x = return x
161 f <*> x = f `ap` x
162
163 instance Functor (EitherList a) where
164 fmap f (E xs) = E $ (fmap . fmap) f xs
165
166 unwrap :: Either a a -> a
167 unwrap (Left x) = x
168 unwrap (Right x) = x
169