src/Gargantext/Text/Terms/Mono/Token/En.hs

   1 {-|
   2 Module      : Gargantext.Text.Ngrams.Token.Text
   3 Description : Tokenizer main functions
   4 Copyright   : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 First inspired from https://bitbucket.org/gchrupala/lingo/overview
  11 -}
  12
  13 {-# LANGUAGE NoImplicitPrelude #-}
  14 {-# LANGUAGE OverloadedStrings #-}
  15
  16 module Gargantext.Text.Terms.Mono.Token.En
  17     ( EitherList(..)
  18     , Tokenizer
  19     , tokenize
  20     , defaultTokenizer
  21     , whitespace
  22     , uris
  23     , punctuation
  24     , finalPunctuation
  25     , initialPunctuation
  26     , allPunctuation
  27     , contractions
  28     , negatives
  29     )
  30   where
  31
  32 import Data.Foldable (concatMap)
  33 import qualified Data.Char as Char
  34 import Data.Maybe
  35 import Control.Monad
  36 import Control.Applicative (Applicative)
  37
  38 import Data.Text (Text)
  39 import qualified Data.Text as T
  40
  41 import Data.Either
  42 import Gargantext.Prelude
  43
  44 -- | A Tokenizer is function which takes a list and returns a list of Eithers
  45 --  (wrapped in a newtype). Right Texts will be passed on for processing
  46 --  to tokenizers down
  47 --  the pipeline. Left Texts will be passed through the pipeline unchanged.
  48 --  Use a Left Texts in a tokenizer to protect certain tokens from further
  49 --  processing (e.g. see the 'uris' tokenizer).
  50 --  You can define your own custom tokenizer pipelines by chaining tokenizers together:
  51 ---
  52 -- > myTokenizer :: Tokenizer
  53 -- > myTokenizer = whitespace >=> allPunctuation
  54 -- examples :: [Text]
  55 -- examples =
  56 --    ["This shouldn't happen."
  57 --    ,"Some 'quoted' stuff"
  58 --    ,"This is a URL: http://example.org."
  59 --    ,"How about an email@example.com"
  60 --    ,"ReferenceError #1065 broke my debugger!"
  61 --    ,"I would've gone."
  62 --    ,"They've been there."
  63 --    ,"Hyphen-words"
  64 --    ,"Yes/No questions"
  65 --    ]
  66 type Tokenizer =  Text -> EitherList Text Text
  67
  68 -- | The EitherList is a newtype-wrapped list of Eithers.
  69 newtype EitherList a b =  E { unE :: [Either a b] }
  70
  71 -- | Split string into words using the default tokenizer pipeline
  72 tokenize :: Text -> [Text]
  73 tokenize = run defaultTokenizer
  74
  75 -- | Run a tokenizer
  76 run :: Tokenizer -> (Text -> [Text])
  77 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
  78
  79 defaultTokenizer :: Tokenizer
  80 defaultTokenizer =     whitespace
  81                    >=> uris
  82                    >=> punctuation
  83                    >=> contractions
  84                    >=> negatives
  85
  86 -- | Detect common uris and freeze them
  87 uris :: Tokenizer
  88 uris x | isUri x = E [Left x]
  89        | True    = E [Right x]
  90     where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:","https://"]
  91
  92 -- | Split off initial and final punctuation
  93 punctuation :: Tokenizer
  94 punctuation = finalPunctuation >=> initialPunctuation
  95
  96 --hyphens :: Tokenizer
  97 --hyphens xs = E [Right w | w <- T.split (=='-') xs ]
  98
  99 -- | Split off word-final punctuation
 100 finalPunctuation :: Tokenizer
 101 finalPunctuation x = E $ filter (not . T.null . unwrap) res
 102   where
 103     res :: [Either Text Text]
 104     res = case T.span Char.isPunctuation (T.reverse x) of
 105       (ps, w) | T.null ps -> [ Right $ T.reverse w ]
 106               | otherwise -> [ Right $ T.reverse w
 107                              , Right $ T.reverse ps]
 108       -- ([],w) -> [Right . T.reverse $ w]
 109       -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
 110
 111 -- | Split off word-initial punctuation
 112 initialPunctuation :: Tokenizer
 113 initialPunctuation x = E $ filter (not . T.null . unwrap) $
 114     case T.span Char.isPunctuation x of
 115       (ps,w) | T.null ps -> [ Right w ]
 116              | otherwise -> [ Right ps
 117                             , Right w ]
 118
 119 -- | Split tokens on transitions between punctuation and
 120 -- non-punctuation characters. This tokenizer is not included in
 121 -- defaultTokenizer pipeline because dealing with word-internal
 122 -- punctuation is quite application specific.
 123 allPunctuation :: Tokenizer
 124 allPunctuation = E . map Right
 125                  . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
 126
 127 -- | Split words ending in n't, and freeze n't
 128 negatives :: Tokenizer
 129 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
 130                                          , Left "n't" ]
 131             | True                   = E [ Right x ]
 132
 133 -- | Split common contractions off and freeze them.
 134 -- | Currently deals with: 'm, 's, 'd, 've, 'll
 135 contractions :: Tokenizer
 136 contractions x = case catMaybes . map (splitSuffix x) $ cts of
 137                    [] -> return x
 138                    ((w,s):_) -> E [ Right w,Left s]
 139     where cts = ["'m","'s","'d","'ve","'ll"]
 140           splitSuffix w sfx =
 141               let w' = T.reverse w
 142                   len = T.length sfx
 143               in if sfx `T.isSuffixOf` w
 144                  then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
 145                  else Nothing
 146
 147
 148 -- | Split string on whitespace. This is just a wrapper for Data.List.words
 149 whitespace :: Tokenizer
 150 whitespace xs = E [Right w | w <- T.words xs ]
 151
 152 instance Monad (EitherList a) where
 153     return x = E [Right x]
 154     E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
 155
 156 instance Applicative (EitherList a) where
 157     pure x = return x
 158     f <*> x = f `ap` x
 159
 160 instance Functor (EitherList a) where
 161     fmap f (E xs) = E $ (fmap . fmap) f xs
 162
 163 unwrap :: Either a a -> a
 164 unwrap (Left x) = x
 165 unwrap (Right x) = x
 166