{-|
Module      : Gargantext.Core.Text.Terms.Mono.Token.En
Description : Tokenizer main functions
Copyright   : (c) Grzegorz Chrupała (original author); CNRS, 2018-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Initially inspired by https://bitbucket.org/gchrupala/lingo/overview
-}


module Gargantext.Core.Text.Terms.Mono.Token.En
  ( EitherList(..)
  , Tokenizer
  , tokenize
  , defaultTokenizer
  , whitespace
  , uris
  , punctuation
  , finalPunctuation
  , initialPunctuation
  , allPunctuation
  , contractions
  , negatives
  )
  where

import qualified Data.Char as Char
import Data.Maybe
import Control.Monad
import Control.Applicative (Applicative)

import Data.Text (Text)
import qualified Data.Text as T

import Data.Either
import Gargantext.Prelude

-- | A Tokenizer is a function that takes a 'Text' and returns a list of
-- Eithers (wrapped in a newtype). Right Texts are passed on for further
-- processing by tokenizers down the pipeline; Left Texts pass through the
-- pipeline unchanged. Use Left in a tokenizer to protect certain tokens
-- from further processing (e.g. see the 'uris' tokenizer).
-- You can define your own tokenizer pipelines by chaining tokenizers together:
--
-- > myTokenizer :: Tokenizer
-- > myTokenizer = whitespace >=> allPunctuation
-- examples :: [Text]
-- examples =
--   ["This shouldn't happen."
--   ,"Some 'quoted' stuff"
--   ,"This is a URL: http://example.org."
--   ,"How about an email@example.com"
--   ,"ReferenceError #1065 broke my debugger!"
--   ,"I would've gone."
--   ,"They've been there."
--   ,"Hyphen-words"
--   ,"Yes/No questions"
--   ]
type Tokenizer = Text -> EitherList Text Text

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b = E { unE :: [Either a b] }

-- | Split a string into words using the default tokenizer pipeline
tokenize :: Text -> [Text]
tokenize = run defaultTokenizer

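-- A hand-traced sketch of the default pipeline (indicative, not a verified
-- doctest): the URI is frozen before punctuation splitting, so its trailing
-- dot stays attached, while "shouldn't" is split by 'negatives'.
--
-- > tokenize "This shouldn't happen: http://example.org."
-- >   == ["This","should","n't","happen",":","http://example.org."]
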
-- | Run a tokenizer. 'T.copy' is applied to each token so the results do
-- not keep the whole input buffer alive.
run :: Tokenizer -> (Text -> [Text])
run f = \txt -> map T.copy $ (map unwrap . unE . f) txt

-- | The default tokenizer pipeline.
defaultTokenizer :: Tokenizer
defaultTokenizer = whitespace
               >=> uris
               >=> punctuation
               >=> contractions
               >=> negatives

-- | Detect common URIs and freeze them
uris :: Tokenizer
uris x | isUri x   = E [Left  x]
       | otherwise = E [Right x]
  where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:","https://"]

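-- An illustrative, hand-traced note (not a verified doctest): because 'uris'
-- wraps the token in Left, the later punctuation stages leave it alone, so
-- the final "." is not split off the URL.
--
-- > run (uris >=> punctuation) "http://example.org." == ["http://example.org."]
-- > run punctuation            "http://example.org." == ["http://example.org","."]
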
-- | Split off initial and final punctuation
punctuation :: Tokenizer
punctuation = finalPunctuation >=> initialPunctuation

--hyphens :: Tokenizer
--hyphens xs = E [Right w | w <- T.split (=='-') xs ]

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer
finalPunctuation x = E $ filter (not . T.null . unwrap) res
  where
    res :: [Either Text Text]
    res = case T.span Char.isPunctuation (T.reverse x) of
      (ps, w) | T.null ps -> [ Right $ T.reverse w ]
              | otherwise -> [ Right $ T.reverse w
                             , Right $ T.reverse ps ]
      -- ([],w) -> [Right . T.reverse $ w]
      -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]

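-- Hand-traced example (indicative): the word is reversed, the leading run of
-- punctuation of the reversed text is split off, and both parts are reversed
-- back.
--
-- > run finalPunctuation "debugger!" == ["debugger","!"]
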
-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
initialPunctuation x = E $ filter (not . T.null . unwrap) $
  case T.span Char.isPunctuation x of
    (ps, w) | T.null ps -> [ Right w ]
            | otherwise -> [ Right ps
                           , Right w ]

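-- Hand-traced example (indicative):
--
-- > run initialPunctuation "'quoted" == ["'","quoted"]
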
-- | Split tokens on transitions between punctuation and
-- non-punctuation characters. This tokenizer is not included in the
-- 'defaultTokenizer' pipeline because dealing with word-internal
-- punctuation is quite application specific.
allPunctuation :: Tokenizer
allPunctuation = E . map Right
               . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)

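-- Hand-traced example (indicative): adjacent characters are grouped by
-- whether they are punctuation, so "Yes/No" becomes three tokens.
--
-- > run allPunctuation "Yes/No" == ["Yes","/","No"]
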
-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
negatives x | "n't" `T.isSuffixOf` x = E [ Right $ T.dropEnd 3 x
                                         , Left "n't" ]
            | otherwise              = E [ Right x ]

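-- Hand-traced example (indicative); the "n't" part is frozen (Left), so any
-- later stage would leave it untouched.
--
-- > run negatives "don't" == ["do","n't"]
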
-- | Split common contractions off and freeze them.
-- Currently deals with: 'm, 's, 'd, 've, 'll
contractions :: Tokenizer
contractions x = case mapMaybe (splitSuffix x) cts of
    []        -> return x
    ((w,s):_) -> E [ Right w, Left s ]
  where
    cts = ["'m","'s","'d","'ve","'ll"]
    -- Split w into (stem, suffix) when sfx is a suffix of w.
    splitSuffix w sfx
      | sfx `T.isSuffixOf` w = Just (T.dropEnd len w, T.takeEnd len w)
      | otherwise            = Nothing
      where len = T.length sfx
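
-- Hand-traced example (indicative): the first matching suffix wins and is
-- frozen (Left) so later stages skip it.
--
-- > run contractions "they've" == ["they","'ve"]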

-- | Split string on whitespace. This is just a wrapper for 'T.words'.
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- T.words xs ]

instance Monad (EitherList a) where
  E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs

instance Applicative (EitherList a) where
  pure x  = E [Right x]
  f <*> x = f `ap` x

instance Functor (EitherList a) where
  fmap f (E xs) = E $ (fmap . fmap) f xs

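-- How the Monad instance threads tokens (hand-traced, indicative): bind maps
-- each Right through the next tokenizer and passes each Left along unchanged,
-- which is what lets frozen tokens skip the rest of the pipeline.
--
-- > unE (E [Left "n't", Right "do not"] >>= whitespace)
-- >   == [Left "n't", Right "do", Right "not"]
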
-- | Extract the payload from either side.
unwrap :: Either a a -> a
unwrap (Left  x) = x
unwrap (Right x) = x