{-|
Module      : Gargantext.Core.Text.Terms.Mono.Token.En
Description : Tokenizer main functions
Copyright   : (c) Grzegorz Chrupała (original author); CNRS, 2018-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Initially inspired by https://bitbucket.org/gchrupala/lingo/overview
-}

module Gargantext.Core.Text.Terms.Mono.Token.En
  ( EitherList(..)
  , Tokenizer
  , tokenize
  , defaultTokenizer
  , whitespace
  , uris
  , punctuation
  , finalPunctuation
  , initialPunctuation
  , allPunctuation
  , contractions
  , negatives
  )
  where

import Data.Foldable (concatMap)
import qualified Data.Char as Char
import Data.Maybe
import Control.Monad
import Control.Applicative (Applicative)

import Data.Text (Text)
import qualified Data.Text as T

import Data.Either
import Gargantext.Prelude

-- | A 'Tokenizer' is a function which takes a 'Text' and returns a list of
-- Eithers (wrapped in a newtype). 'Right' Texts are passed on for further
-- processing by tokenizers further down the pipeline, while 'Left' Texts
-- pass through the pipeline unchanged. Use a 'Left' Text in a tokenizer to
-- protect certain tokens from further processing (e.g. see the 'uris'
-- tokenizer).
--
-- You can define your own custom tokenizer pipelines by chaining tokenizers
-- together:
--
-- > myTokenizer :: Tokenizer
-- > myTokenizer = whitespace >=> allPunctuation
--
-- examples :: [Text]
-- examples =
--   [ "This shouldn't happen."
--   , "Some 'quoted' stuff"
--   , "This is a URL: http://example.org."
--   , "How about an email@example.com"
--   , "ReferenceError #1065 broke my debugger!"
--   , "I would've gone."
--   , "They've been there."
--   , "Hyphen-words"
--   , "Yes/No questions"
--   ]
type Tokenizer = Text -> EitherList Text Text

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b = E { unE :: [Either a b] }

-- | Split a string into words using the default tokenizer pipeline.
tokenize :: Text -> [Text]
tokenize = run defaultTokenizer

-- | Run a tokenizer. 'T.copy' is applied to every token so that the small
-- resulting slices do not retain the whole input 'Text' in memory.
run :: Tokenizer -> (Text -> [Text])
run f = map (T.copy . unwrap) . unE . f

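-- | The default pipeline: split on whitespace, freeze URIs, split off
-- punctuation, then split contractions and negations. Illustrative behaviour
-- (hand-traced from the combinators below, not taken from a test suite):
--
-- > tokenize "This shouldn't happen." == ["This","should","n't","happen","."]
-- > tokenize "I would've gone."       == ["I","would","'ve","gone","."]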
defaultTokenizer :: Tokenizer
defaultTokenizer =  whitespace
                >=> uris
                >=> punctuation
                >=> contractions
                >=> negatives

-- | Detect common URIs and freeze them
uris :: Tokenizer
uris x | isUri x   = E [Left x]
       | otherwise = E [Right x]
  where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:","https://"]
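
-- For example, @uris "http://example.org"@ yields a single frozen ('Left')
-- token that later stages leave untouched, while @uris "example"@ yields a
-- 'Right' token that later tokenizers may still split.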

-- | Split off initial and final punctuation
punctuation :: Tokenizer
punctuation = finalPunctuation >=> initialPunctuation

--hyphens :: Tokenizer
--hyphens xs = E [Right w | w <- T.split (=='-') xs ]

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer
finalPunctuation x = E $ filter (not . T.null . unwrap) res
  where
    res :: [Either Text Text]
    res = case T.span Char.isPunctuation (T.reverse x) of
      (ps, w) | T.null ps -> [ Right $ T.reverse w ]
              | otherwise -> [ Right $ T.reverse w
                             , Right $ T.reverse ps ]

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
initialPunctuation x = E $ filter (not . T.null . unwrap) $
  case T.span Char.isPunctuation x of
    (ps, w) | T.null ps -> [ Right w ]
            | otherwise -> [ Right ps
                           , Right w ]

-- | Split tokens on transitions between punctuation and non-punctuation
-- characters. This tokenizer is not included in the 'defaultTokenizer'
-- pipeline because dealing with word-internal punctuation is quite
-- application-specific.
allPunctuation :: Tokenizer
allPunctuation = E . map Right
               . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)

-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
negatives x | "n't" `T.isSuffixOf` x = E [ Right $ T.dropEnd 3 x
                                         , Left "n't" ]
            | otherwise              = E [ Right x ]
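
-- For example, @negatives "shouldn't"@ is expected to yield
-- @E [Right "should", Left "n't"]@ (illustrative, hand-traced).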

-- | Split common contractions off and freeze them.
-- Currently deals with: 'm, 's, 'd, 've, 'll
contractions :: Tokenizer
contractions x = case catMaybes . map (splitSuffix x) $ cts of
    []        -> E [ Right x ]
    ((w,s):_) -> E [ Right w, Left s ]
  where
    cts = ["'m","'s","'d","'ve","'ll"]
    splitSuffix w sfx =
      let w'  = T.reverse w
          len = T.length sfx
      in  if sfx `T.isSuffixOf` w
            then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
            else Nothing
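
-- For example, @contractions "would've"@ is expected to yield
-- @E [Right "would", Left "'ve"]@ (illustrative, hand-traced).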


-- | Split a string on whitespace. This is just a wrapper around 'T.words'.
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- T.words xs ]

-- Frozen ('Left') tokens pass through '>>=' untouched; only 'Right' tokens
-- are fed to the next tokenizer in a pipeline.
instance Monad (EitherList a) where
  E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs

instance Applicative (EitherList a) where
  pure x  = E [Right x]
  f <*> x = f `ap` x

instance Functor (EitherList a) where
  fmap f (E xs) = E $ (fmap . fmap) f xs

unwrap :: Either a a -> a
unwrap (Left  x) = x
unwrap (Right x) = x