{-|
Module      : Gargantext.Ngrams.Token.Text
Description : A simple rule-based tokenizer for raw 'Text'
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Inspired from https://bitbucket.org/gchrupala/lingo/overview
-}

{-# LANGUAGE OverloadedStrings #-}

module Gargantext.Ngrams.Token.Text
  ( EitherList(..)
  , Tokenizer
  , tokenize
  , run
  , defaultTokenizer
  , whitespace
  , uris
  , punctuation
  , finalPunctuation
  , initialPunctuation
  , allPunctuation
  , contractions
  , negatives
  ) where

import qualified Data.Char as Char
import Data.Maybe (mapMaybe)
import Control.Applicative
import Control.Monad ((>=>), ap)

import Data.Text (Text)
import qualified Data.Text as T

-- | A 'Tokenizer' is a function which takes a 'Text' and returns a list of
-- Eithers (wrapped in a newtype). Right Texts will be passed on for
-- processing to tokenizers further down the pipeline. Left Texts will be
-- passed through the pipeline unchanged. Use a Left Text in a tokenizer to
-- protect certain tokens from further processing (e.g. see the 'uris'
-- tokenizer).
--
-- You can define your own custom tokenizer pipelines by chaining tokenizers
-- together:
--
-- > myTokenizer :: Tokenizer
-- > myTokenizer = whitespace >=> allPunctuation

type Tokenizer = Text -> EitherList Text Text

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b = E { unE :: [Either a b] }

-- | Split a string into words using the default tokenizer pipeline.
tokenize :: Text -> [Text]
tokenize = run defaultTokenizer

-- | Run a tokenizer. 'T.copy' is applied to every token so that the small
-- result fragments do not keep the whole input buffer alive.
run :: Tokenizer -> (Text -> [Text])
run f = map (T.copy . unwrap) . unE . f

-- | The default pipeline: split on whitespace, freeze URIs, then split off
-- punctuation, contractions and negations.
defaultTokenizer :: Tokenizer
defaultTokenizer =  whitespace
                >=> uris
                >=> punctuation
                >=> contractions
                >=> negatives

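-- A quick usage sketch (added as illustration, not taken from the original
-- module); the expected result follows from tracing the pipeline above:
--
-- > tokenize "This shouldn't happen."
-- > -- ["This","should","n't","happen","."]
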
-- | Detect common URIs and freeze them
uris :: Tokenizer
uris x | isUri x   = E [Left  x]
       | otherwise = E [Right x]
  where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]

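-- An illustrative sketch (ours, not from the original module): a frozen URI
-- passes through later stages untouched, keeping its trailing dot:
--
-- > run (whitespace >=> uris >=> punctuation) "See http://example.org."
-- > -- ["See","http://example.org."]
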
-- | Split off initial and final punctuation
punctuation :: Tokenizer
punctuation = finalPunctuation >=> initialPunctuation

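-- An illustrative expectation (ours, not from the original module): both
-- surrounding quotes are split off, one by each half of the pipeline:
--
-- > run punctuation "'quoted'"
-- > -- ["'","quoted","'"]
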
-- | Split tokens on hyphens (not exported and not used by 'defaultTokenizer').
hyphens :: Tokenizer
hyphens xs = E [Right w | w <- T.split (=='-') xs ]

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer
finalPunctuation x = E $ filter (not . T.null . unwrap) res
  where
    res :: [Either Text Text]
    res = case T.span Char.isPunctuation (T.reverse x) of
      (ps, w) | T.null ps -> [ Right $ T.reverse w ]
              | otherwise -> [ Right $ T.reverse w
                             , Right $ T.reverse ps ]

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
initialPunctuation x = E $ filter (not . T.null . unwrap) $
  case T.span Char.isPunctuation x of
    (ps, w) | T.null ps -> [ Right w ]
            | otherwise -> [ Right ps
                           , Right w ]

-- | Split tokens on transitions between punctuation and
-- non-punctuation characters. This tokenizer is not included in the
-- 'defaultTokenizer' pipeline because dealing with word-internal
-- punctuation is quite application-specific.
allPunctuation :: Tokenizer
allPunctuation = E . map Right
               . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b)

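-- A sketch of a custom pipeline using it (illustration only, not part of the
-- original module):
--
-- > run (whitespace >=> allPunctuation) "Yes/No questions"
-- > -- ["Yes","/","No","questions"]
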
-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
negatives x | "n't" `T.isSuffixOf` x = E [ Right (T.dropEnd 3 x), Left "n't" ]
            | otherwise              = E [ Right x ]

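-- Illustrative expectation (ours): the stem stays a Right, "n't" is frozen:
--
-- > run (whitespace >=> negatives) "I don't know"
-- > -- ["I","do","n't","know"]
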
-- | Split common contractions off and freeze them.
--   Currently deals with: 'm, 's, 'd, 've, 'll
contractions :: Tokenizer
contractions x = case mapMaybe (splitSuffix x) cts of
    []        -> pure x
    ((w,s):_) -> E [ Right w, Left s ]
  where
    cts = ["'m","'s","'d","'ve","'ll"]
    -- strip a known suffix, returning the stem and the suffix itself
    splitSuffix w sfx = (\stem -> (stem, sfx)) <$> T.stripSuffix sfx w

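-- Illustrative expectation (ours): the contraction is split off and frozen,
-- while tokens without a known suffix pass through unchanged:
--
-- > run (whitespace >=> contractions) "I would've gone"
-- > -- ["I","would","'ve","gone"]
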
-- | Split a string on whitespace. This is just a wrapper for 'T.words'.
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- T.words xs ]

instance Functor (EitherList a) where
  fmap f (E xs) = E $ (fmap . fmap) f xs

instance Applicative (EitherList a) where
  pure x  = E [Right x]
  f <*> x = f `ap` x

instance Monad (EitherList a) where
  E xs >>= f = E $ concatMap (either (pure . Left) (unE . f)) xs

-- | Forget whether a token was protected ('Left') or not ('Right').
unwrap :: Either a a -> a
unwrap (Left  x) = x
unwrap (Right x) = x

-- | A few sample sentences for trying out the tokenizers.
examples :: [Text]
examples =
  [ "This shouldn't happen."
  , "Some 'quoted' stuff"
  , "This is a URL: http://example.org."
  , "How about an email@example.com"
  , "ReferenceError #1065 broke my debugger!"
  , "I would've gone."
  , "They've been there."
  , "Hyphen-words"
  , "Yes/No questions"
  ]
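
-- A small helper added for illustration only (the name 'tokenizedExamples'
-- is ours, not part of the original module): run the default pipeline over
-- the sample sentences, e.g. for eyeballing the output in GHCi.
tokenizedExamples :: [(Text, [Text])]
tokenizedExamples = [ (ex, tokenize ex) | ex <- examples ]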