src/Gargantext/Text/Ngrams/Token/Text.hs

   1 {-|
   2 Module      : Gargantext.Text.Ngrams.Token.Text
   3 Description :
   4 Copyright   : (c) Grzegorz Chrupała first, after: CNRS, 2018-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Inspired from https://bitbucket.org/gchrupala/lingo/overview
  11 -}
  12
  13
  14 {-# LANGUAGE OverloadedStrings #-}
  15
  16 module Gargantext.Text.Ngrams.Token.Text
  17     ( EitherList(..)
  18     , Tokenizer
  19     , tokenize
  20     , run
  21     , defaultTokenizer
  22     , whitespace
  23     , uris
  24     , punctuation
  25     , finalPunctuation
  26     , initialPunctuation
  27     , allPunctuation
  28     , contractions
  29     , negatives
  30     )
  31   where
  32
  33 import qualified Data.Char as Char
  34 import Data.Maybe
  35 import Control.Monad
  36
  37 import Data.Text (Text)
  38 import qualified Data.Text as T
  39
  40 -- | A Tokenizer is function which takes a list and returns a list of Eithers
  41 --  (wrapped in a newtype). Right Texts will be passed on for processing
  42 --  to tokenizers down
  43 --  the pipeline. Left Texts will be passed through the pipeline unchanged.
  44 --  Use a Left Texts in a tokenizer to protect certain tokens from further
  45 --  processing (e.g. see the 'uris' tokenizer).
  46 --  You can define your own custom tokenizer pipelines by chaining tokenizers together:
  47 ---
  48 -- > myTokenizer :: Tokenizer
  49 -- > myTokenizer = whitespace >=> allPunctuation
  50 -- examples :: [Text]
  51 -- examples =
  52 --    ["This shouldn't happen."
  53 --    ,"Some 'quoted' stuff"
  54 --    ,"This is a URL: http://example.org."
  55 --    ,"How about an email@example.com"
  56 --    ,"ReferenceError #1065 broke my debugger!"
  57 --    ,"I would've gone."
  58 --    ,"They've been there."
  59 --    ,"Hyphen-words"
  60 --    ,"Yes/No questions"
  61 --    ]
  62 ---
  63
  64 type Tokenizer =  Text -> EitherList Text Text
  65
  66 -- | The EitherList is a newtype-wrapped list of Eithers.
  67 newtype EitherList a b =  E { unE :: [Either a b] }
  68
  69 -- | Split string into words using the default tokenizer pipeline
  70 tokenize :: Text -> [Text]
  71 tokenize = run defaultTokenizer
  72
  73 -- | Run a tokenizer
  74 run :: Tokenizer -> (Text -> [Text])
  75 run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
  76
  77 defaultTokenizer :: Tokenizer
  78 defaultTokenizer =     whitespace
  79                    >=> uris
  80                    >=> punctuation
  81                    >=> contractions
  82                    >=> negatives
  83
  84 -- | Detect common uris and freeze them
  85 uris :: Tokenizer
  86 uris x | isUri x = E [Left x]
  87        | True    = E [Right x]
  88     where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
  89
  90 -- | Split off initial and final punctuation
  91 punctuation :: Tokenizer
  92 punctuation = finalPunctuation >=> initialPunctuation
  93
  94 --hyphens :: Tokenizer
  95 --hyphens xs = E [Right w | w <- T.split (=='-') xs ]
  96
  97 -- | Split off word-final punctuation
  98 finalPunctuation :: Tokenizer
  99 finalPunctuation x = E $ filter (not . T.null . unwrap) res
 100   where
 101     res :: [Either Text Text]
 102     res = case T.span Char.isPunctuation (T.reverse x) of
 103       (ps, w) | T.null ps -> [ Right $ T.reverse w ]
 104               | otherwise -> [ Right $ T.reverse w
 105                              , Right $ T.reverse ps]
 106       -- ([],w) -> [Right . T.reverse $ w]
 107       -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]
 108
 109 -- | Split off word-initial punctuation
 110 initialPunctuation :: Tokenizer
 111 initialPunctuation x = E $ filter (not . T.null . unwrap) $
 112     case T.span Char.isPunctuation x of
 113       (ps,w) | T.null ps -> [ Right w ]
 114              | otherwise -> [ Right ps
 115                             , Right w ]
 116
 117 -- | Split tokens on transitions between punctuation and
 118 -- non-punctuation characters. This tokenizer is not included in
 119 -- defaultTokenizer pipeline because dealing with word-internal
 120 -- punctuation is quite application specific.
 121 allPunctuation :: Tokenizer
 122 allPunctuation = E . map Right
 123                  . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)
 124
 125 -- | Split words ending in n't, and freeze n't
 126 negatives :: Tokenizer
 127 negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
 128                                          , Left "n't" ]
 129             | True                   = E [ Right x ]
 130
 131 -- | Split common contractions off and freeze them.
 132 -- | Currently deals with: 'm, 's, 'd, 've, 'll
 133 contractions :: Tokenizer
 134 contractions x = case catMaybes . map (splitSuffix x) $ cts of
 135                    [] -> return x
 136                    ((w,s):_) -> E [ Right w,Left s]
 137     where cts = ["'m","'s","'d","'ve","'ll"]
 138           splitSuffix w sfx =
 139               let w' = T.reverse w
 140                   len = T.length sfx
 141               in if sfx `T.isSuffixOf` w
 142                  then Just (T.take (T.length w - len) w, T.reverse . T.take len $ w')
 143                  else Nothing
 144
 145
 146 -- | Split string on whitespace. This is just a wrapper for Data.List.words
 147 whitespace :: Tokenizer
 148 whitespace xs = E [Right w | w <- T.words xs ]
 149
 150 instance Monad (EitherList a) where
 151     return x = E [Right x]
 152     E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
 153
 154 instance Applicative (EitherList a) where
 155     pure x = return x
 156     f <*> x = f `ap` x
 157
 158 instance Functor (EitherList a) where
 159     fmap f (E xs) = E $ (fmap . fmap) f xs
 160
 161 unwrap :: Either a a -> a
 162 unwrap (Left x) = x
 163 unwrap (Right x) = x
 164