]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Token/Text.hs
[Structure] Ngrams -> Text.
[gargantext.git] / src / Gargantext / Text / Token / Text.hs
1 {-|
2 Module : Gargantext.Text.Token.Text
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Inspired from https://bitbucket.org/gchrupala/lingo/overview
11 -}
12
13
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Text.Token.Text
17 ( EitherList(..)
18 , Tokenizer
19 , tokenize
20 , run
21 , defaultTokenizer
22 , whitespace
23 , uris
24 , punctuation
25 , finalPunctuation
26 , initialPunctuation
27 , allPunctuation
28 , contractions
29 , negatives
30 )
31 where
32
33 import qualified Data.Char as Char
34 import Data.Maybe
35 import Control.Monad
36
37 import Data.Text (Text)
38 import qualified Data.Text as T
39
40 -- | A Tokenizer is function which takes a list and returns a list of Eithers
41 -- (wrapped in a newtype). Right Texts will be passed on for processing
42 -- to tokenizers down
43 -- the pipeline. Left Texts will be passed through the pipeline unchanged.
44 -- Use a Left Texts in a tokenizer to protect certain tokens from further
45 -- processing (e.g. see the 'uris' tokenizer).
46 -- You can define your own custom tokenizer pipelines by chaining tokenizers together:
47 ---
48 -- > myTokenizer :: Tokenizer
49 -- > myTokenizer = whitespace >=> allPunctuation
50 -- examples :: [Text]
51 -- examples =
52 -- ["This shouldn't happen."
53 -- ,"Some 'quoted' stuff"
54 -- ,"This is a URL: http://example.org."
55 -- ,"How about an email@example.com"
56 -- ,"ReferenceError #1065 broke my debugger!"
57 -- ,"I would've gone."
58 -- ,"They've been there."
59 -- ,"Hyphen-words"
60 -- ,"Yes/No questions"
61 -- ]
62 ---
63
64 type Tokenizer = Text -> EitherList Text Text
65
66 -- | The EitherList is a newtype-wrapped list of Eithers.
67 newtype EitherList a b = E { unE :: [Either a b] }
68
69 -- | Split string into words using the default tokenizer pipeline
70 tokenize :: Text -> [Text]
71 tokenize = run defaultTokenizer
72
73 -- | Run a tokenizer
74 run :: Tokenizer -> (Text -> [Text])
75 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
76
77 defaultTokenizer :: Tokenizer
78 defaultTokenizer = whitespace
79 >=> uris
80 >=> punctuation
81 >=> contractions
82 >=> negatives
83
84 -- | Detect common uris and freeze them
85 uris :: Tokenizer
86 uris x | isUri x = E [Left x]
87 | True = E [Right x]
88 where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
89
90 -- | Split off initial and final punctuation
91 punctuation :: Tokenizer
92 punctuation = finalPunctuation >=> initialPunctuation
93
94 --hyphens :: Tokenizer
95 --hyphens xs = E [Right w | w <- T.split (=='-') xs ]
96
97 -- | Split off word-final punctuation
98 finalPunctuation :: Tokenizer
99 finalPunctuation x = E $ filter (not . T.null . unwrap) res
100 where
101 res :: [Either Text Text]
102 res = case T.span Char.isPunctuation (T.reverse x) of
103 (ps, w) | T.null ps -> [ Right $ T.reverse w ]
104 | otherwise -> [ Right $ T.reverse w
105 , Right $ T.reverse ps]
106 -- ([],w) -> [Right . T.reverse $ w]
107 -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
108
109 -- | Split off word-initial punctuation
110 initialPunctuation :: Tokenizer
111 initialPunctuation x = E $ filter (not . T.null . unwrap) $
112 case T.span Char.isPunctuation x of
113 (ps,w) | T.null ps -> [ Right w ]
114 | otherwise -> [ Right ps
115 , Right w ]
116
117 -- | Split tokens on transitions between punctuation and
118 -- non-punctuation characters. This tokenizer is not included in
119 -- defaultTokenizer pipeline because dealing with word-internal
120 -- punctuation is quite application specific.
121 allPunctuation :: Tokenizer
122 allPunctuation = E . map Right
123 . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
124
125 -- | Split words ending in n't, and freeze n't
126 negatives :: Tokenizer
127 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
128 , Left "n't" ]
129 | True = E [ Right x ]
130
131 -- | Split common contractions off and freeze them.
132 -- | Currently deals with: 'm, 's, 'd, 've, 'll
133 contractions :: Tokenizer
134 contractions x = case catMaybes . map (splitSuffix x) $ cts of
135 [] -> return x
136 ((w,s):_) -> E [ Right w,Left s]
137 where cts = ["'m","'s","'d","'ve","'ll"]
138 splitSuffix w sfx =
139 let w' = T.reverse w
140 len = T.length sfx
141 in if sfx `T.isSuffixOf` w
142 then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
143 else Nothing
144
145
146 -- | Split string on whitespace. This is just a wrapper for Data.List.words
147 whitespace :: Tokenizer
148 whitespace xs = E [Right w | w <- T.words xs ]
149
150 instance Monad (EitherList a) where
151 return x = E [Right x]
152 E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
153
154 instance Applicative (EitherList a) where
155 pure x = return x
156 f <*> x = f `ap` x
157
158 instance Functor (EitherList a) where
159 fmap f (E xs) = E $ (fmap . fmap) f xs
160
161 unwrap :: Either a a -> a
162 unwrap (Left x) = x
163 unwrap (Right x) = x
164