{-|
Module      : Gargantext.Ngrams.Token.Text
Description : A simple rule-based tokenizer for raw 'Text'
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Inspired from https://bitbucket.org/gchrupala/lingo/overview
-}

{-# LANGUAGE OverloadedStrings #-}

module Gargantext.Ngrams.Token.Text
  ( EitherList(..)
  , Tokenizer
  , tokenize
  , run
  , defaultTokenizer
  , whitespace
  , uris
  , punctuation
  , finalPunctuation
  , initialPunctuation
  , allPunctuation
  , contractions
  , negatives
  ) where

import qualified Data.Char as Char
import Data.Maybe (mapMaybe)
import Control.Applicative
import Control.Monad ((>=>), ap)

import Data.Text (Text)
import qualified Data.Text as T

-- | A 'Tokenizer' is a function which takes a 'Text' and returns a list of
-- Eithers (wrapped in a newtype). Right Texts will be passed on for
-- processing to tokenizers further down the pipeline. Left Texts will be
-- passed through the pipeline unchanged. Use a Left Text in a tokenizer to
-- protect certain tokens from further processing (e.g. see the 'uris'
-- tokenizer).
--
-- You can define your own custom tokenizer pipelines by chaining tokenizers
-- together:
--
-- > myTokenizer :: Tokenizer
-- > myTokenizer = whitespace >=> allPunctuation

type Tokenizer = Text -> EitherList Text Text

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b = E { unE :: [Either a b] }

-- | Split a string into words using the default tokenizer pipeline.
tokenize :: Text -> [Text]
tokenize = run defaultTokenizer

-- | Run a tokenizer. 'T.copy' is applied to every token so that the small
-- result fragments do not keep the whole input buffer alive.
run :: Tokenizer -> (Text -> [Text])
run f = map (T.copy . unwrap) . unE . f

-- | The default pipeline: split on whitespace, freeze URIs, then split off
-- punctuation, contractions and negations.
defaultTokenizer :: Tokenizer
defaultTokenizer =  whitespace
                >=> uris
                >=> punctuation
                >=> contractions
                >=> negatives

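-- A quick usage sketch (added as illustration, not taken from the original
-- module); the expected result follows from tracing the pipeline above:
--
-- > tokenize "This shouldn't happen."
-- > -- ["This","should","n't","happen","."]
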
-- | Detect common URIs and freeze them
uris :: Tokenizer
uris x | isUri x   = E [Left  x]
       | otherwise = E [Right x]
  where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]

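-- An illustrative sketch (ours, not from the original module): a frozen URI
-- passes through later stages untouched, keeping its trailing dot:
--
-- > run (whitespace >=> uris >=> punctuation) "See http://example.org."
-- > -- ["See","http://example.org."]
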
-- | Split off initial and final punctuation
punctuation :: Tokenizer
punctuation = finalPunctuation >=> initialPunctuation

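-- An illustrative expectation (ours, not from the original module): both
-- surrounding quotes are split off, one by each half of the pipeline:
--
-- > run punctuation "'quoted'"
-- > -- ["'","quoted","'"]
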
-- | Split tokens on hyphens (not exported and not used by 'defaultTokenizer').
hyphens :: Tokenizer
hyphens xs = E [Right w | w <- T.split (=='-') xs ]

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer
finalPunctuation x = E $ filter (not . T.null . unwrap) res
  where
    res :: [Either Text Text]
    res = case T.span Char.isPunctuation (T.reverse x) of
      (ps, w) | T.null ps -> [ Right $ T.reverse w ]
              | otherwise -> [ Right $ T.reverse w
                             , Right $ T.reverse ps ]

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
initialPunctuation x = E $ filter (not . T.null . unwrap) $
  case T.span Char.isPunctuation x of
    (ps, w) | T.null ps -> [ Right w ]
            | otherwise -> [ Right ps
                           , Right w ]

-- | Split tokens on transitions between punctuation and
-- non-punctuation characters. This tokenizer is not included in the
-- 'defaultTokenizer' pipeline because dealing with word-internal
-- punctuation is quite application-specific.
allPunctuation :: Tokenizer
allPunctuation = E . map Right
               . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)

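-- A sketch of a custom pipeline using it (illustration only, not part of the
-- original module):
--
-- > run (whitespace >=> allPunctuation) "Yes/No questions"
-- > -- ["Yes","/","No","questions"]
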
-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
negatives x | "n't" `T.isSuffixOf` x = E [ Right (T.dropEnd 3 x), Left "n't" ]
            | otherwise              = E [ Right x ]

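-- Illustrative expectation (ours): the stem stays a Right, "n't" is frozen:
--
-- > run (whitespace >=> negatives) "I don't know"
-- > -- ["I","do","n't","know"]
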
-- | Split common contractions off and freeze them.
--   Currently deals with: 'm, 's, 'd, 've, 'll
contractions :: Tokenizer
contractions x = case mapMaybe (splitSuffix x) cts of
    []        -> pure x
    ((w,s):_) -> E [ Right w, Left s ]
  where
    cts = ["'m","'s","'d","'ve","'ll"]
    -- strip a known suffix, returning the stem and the suffix itself
    splitSuffix w sfx = (\stem -> (stem, sfx)) <$> T.stripSuffix sfx w

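-- Illustrative expectation (ours): the contraction is split off and frozen,
-- while tokens without a known suffix pass through unchanged:
--
-- > run (whitespace >=> contractions) "I would've gone"
-- > -- ["I","would","'ve","gone"]
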
-- | Split a string on whitespace. This is just a wrapper for 'T.words'.
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- T.words xs ]

instance Functor (EitherList a) where
  fmap f (E xs) = E $ (fmap . fmap) f xs

instance Applicative (EitherList a) where
  pure x  = E [Right x]
  f <*> x = f `ap` x

instance Monad (EitherList a) where
  E xs >>= f = E $ concatMap (either (pure . Left) (unE . f)) xs

-- | Forget whether a token was protected ('Left') or not ('Right').
unwrap :: Either a a -> a
unwrap (Left  x) = x
unwrap (Right x) = x

-- | A few sample sentences for trying out the tokenizers.
examples :: [Text]
examples =
  [ "This shouldn't happen."
  , "Some 'quoted' stuff"
  , "This is a URL: http://example.org."
  , "How about an email@example.com"
  , "ReferenceError #1065 broke my debugger!"
  , "I would've gone."
  , "They've been there."
  , "Hyphen-words"
  , "Yes/No questions"
  ]
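
-- A small helper added for illustration only (the name 'tokenizedExamples'
-- is ours, not part of the original module): run the default pipeline over
-- the sample sentences, e.g. for eyeballing the output in GHCi.
tokenizedExamples :: [(Text, [Text])]
tokenizedExamples = [ (ex, tokenize ex) | ex <- examples ]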