src/Gargantext/Text/Terms/Mono/Token/En.hs

   1 {-|
Module      : Gargantext.Text.Terms.Mono.Token.En
   3 Description :
   4 Copyright   : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 First inspired from https://bitbucket.org/gchrupala/lingo/overview
  11 -}
  12
  13 {-# LANGUAGE NoImplicitPrelude #-}
  14 {-# LANGUAGE OverloadedStrings #-}
  15
  16 module Gargantext.Text.Terms.Mono.Token.En
  17     ( EitherList(..)
  18     , Tokenizer
  19     , tokenize
  20     , run
  21     , defaultTokenizer
  22     , whitespace
  23     , uris
  24     , punctuation
  25     , finalPunctuation
  26     , initialPunctuation
  27     , allPunctuation
  28     , contractions
  29     , negatives
  30     )
  31   where
  32
  33 import Data.Foldable (concatMap)
  34 import qualified Data.Char as Char
  35 import Data.Maybe
  36 import Control.Monad
  37 import Control.Applicative (Applicative)
  38
  39 import Data.Text (Text)
  40 import qualified Data.Text as T
  41
  42 import Data.Either
  43 import Gargantext.Prelude
  44
  45 -- | A Tokenizer is function which takes a list and returns a list of Eithers
  46 --  (wrapped in a newtype). Right Texts will be passed on for processing
  47 --  to tokenizers down
  48 --  the pipeline. Left Texts will be passed through the pipeline unchanged.
  49 --  Use a Left Texts in a tokenizer to protect certain tokens from further
  50 --  processing (e.g. see the 'uris' tokenizer).
  51 --  You can define your own custom tokenizer pipelines by chaining tokenizers together:
  52 ---
  53 -- > myTokenizer :: Tokenizer
  54 -- > myTokenizer = whitespace >=> allPunctuation
  55 -- examples :: [Text]
  56 -- examples =
  57 --    ["This shouldn't happen."
  58 --    ,"Some 'quoted' stuff"
  59 --    ,"This is a URL: http://example.org."
  60 --    ,"How about an email@example.com"
  61 --    ,"ReferenceError #1065 broke my debugger!"
  62 --    ,"I would've gone."
  63 --    ,"They've been there."
  64 --    ,"Hyphen-words"
  65 --    ,"Yes/No questions"
  66 --    ]
  67 ---
  68
  69 type Tokenizer =  Text -> EitherList Text Text
  70
  71 -- | The EitherList is a newtype-wrapped list of Eithers.
  72 newtype EitherList a b =  E { unE :: [Either a b] }
  73
  74 -- | Split string into words using the default tokenizer pipeline
  75 tokenize :: Text -> [Text]
  76 tokenize = run defaultTokenizer
  77
  78 -- | Run a tokenizer
  79 run :: Tokenizer -> (Text -> [Text])
  80 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
  81
  82 defaultTokenizer :: Tokenizer
  83 defaultTokenizer =     whitespace
  84                    >=> uris
  85                    >=> punctuation
  86                    >=> contractions
  87                    >=> negatives
  88
  89 -- | Detect common uris and freeze them
  90 uris :: Tokenizer
  91 uris x | isUri x = E [Left x]
  92        | True    = E [Right x]
  93     where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
  94
  95 -- | Split off initial and final punctuation
  96 punctuation :: Tokenizer
  97 punctuation = finalPunctuation >=> initialPunctuation
  98
  99 --hyphens :: Tokenizer
 100 --hyphens xs = E [Right w | w <- T.split (=='-') xs ]
 101
 102 -- | Split off word-final punctuation
 103 finalPunctuation :: Tokenizer
 104 finalPunctuation x = E $ filter (not . T.null . unwrap) res
 105   where
 106     res :: [Either Text Text]
 107     res = case T.span Char.isPunctuation (T.reverse x) of
 108       (ps, w) | T.null ps -> [ Right $ T.reverse w ]
 109               | otherwise -> [ Right $ T.reverse w
 110                              , Right $ T.reverse ps]
 111       -- ([],w) -> [Right . T.reverse $ w]
 112       -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
 113
 114 -- | Split off word-initial punctuation
 115 initialPunctuation :: Tokenizer
 116 initialPunctuation x = E $ filter (not . T.null . unwrap) $
 117     case T.span Char.isPunctuation x of
 118       (ps,w) | T.null ps -> [ Right w ]
 119              | otherwise -> [ Right ps
 120                             , Right w ]
 121
 122 -- | Split tokens on transitions between punctuation and
 123 -- non-punctuation characters. This tokenizer is not included in
 124 -- defaultTokenizer pipeline because dealing with word-internal
 125 -- punctuation is quite application specific.
 126 allPunctuation :: Tokenizer
 127 allPunctuation = E . map Right
 128                  . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
 129
 130 -- | Split words ending in n't, and freeze n't
 131 negatives :: Tokenizer
 132 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
 133                                          , Left "n't" ]
 134             | True                   = E [ Right x ]
 135
 136 -- | Split common contractions off and freeze them.
 137 -- | Currently deals with: 'm, 's, 'd, 've, 'll
 138 contractions :: Tokenizer
 139 contractions x = case catMaybes . map (splitSuffix x) $ cts of
 140                    [] -> return x
 141                    ((w,s):_) -> E [ Right w,Left s]
 142     where cts = ["'m","'s","'d","'ve","'ll"]
 143           splitSuffix w sfx =
 144               let w' = T.reverse w
 145                   len = T.length sfx
 146               in if sfx `T.isSuffixOf` w
 147                  then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
 148                  else Nothing
 149
 150
 151 -- | Split string on whitespace. This is just a wrapper for Data.List.words
 152 whitespace :: Tokenizer
 153 whitespace xs = E [Right w | w <- T.words xs ]
 154
 155 instance Monad (EitherList a) where
 156     return x = E [Right x]
 157     E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
 158
 159 instance Applicative (EitherList a) where
 160     pure x = return x
 161     f <*> x = f `ap` x
 162
 163 instance Functor (EitherList a) where
 164     fmap f (E xs) = E $ (fmap . fmap) f xs
 165
 166 unwrap :: Either a a -> a
 167 unwrap (Left x) = x
 168 unwrap (Right x) = x
 169