src/Gargantext/Core/Text/Terms.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Ngrams
   3 Description : Ngrams definition and tools
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 An @n-gram@ is a contiguous sequence of n items from a given sample of
  11 text. In Gargantext application the items are words, n is a non negative
  12 integer.
  13
  14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
  15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
  16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
  17 "four-gram", "five-gram", and so on.
  18
  19 Source: https://en.wikipedia.org/wiki/Ngrams
  20
  21 TODO
  22 group Ngrams -> Tree
  23 compute occ by node of Tree
  24 group occs according groups
  25
  26 compute cooccurrences
  27 compute graph
  28
  29 -}
  30
  31 {-# LANGUAGE TemplateHaskell   #-}
  32 {-# LANGUAGE ConstrainedClassMethods #-}
  33
  34 module Gargantext.Core.Text.Terms
  35   where
  36
  37 import Control.Lens
  38 import Data.Map (Map)
  39 import qualified Data.Map as Map
  40 import Data.Text (Text)
  41 import Data.Traversable
  42 import GHC.Base (String)
  43 import GHC.Generics (Generic)
  44 import Gargantext.Core
  45 import Gargantext.Core.Types
  46 import Gargantext.Core.Flow.Types
  47 import Gargantext.Prelude
  48 import Gargantext.Core.Text (sentences, HasText(..))
  49 import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
  50 import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..))
  51 import Gargantext.Core.Text.Terms.Mono  (monoTerms)
  52 import Gargantext.Database.Prelude (Cmd)
  53 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
  54 import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
  55 import Gargantext.Core.Text.Terms.Multi (multiterms)
  56 import qualified Data.List as List
  57 import qualified Data.Set  as Set
  58 import qualified Data.Text as Text
  59
  60 data TermType lang
  61   = Mono      { _tt_lang :: !lang }
  62   | Multi     { _tt_lang :: !lang }
  63   | MonoMulti { _tt_lang :: !lang }
  64   | Unsupervised { _tt_lang  :: !lang
  65                  , _tt_windowSize :: !Int
  66                  , _tt_ngramsSize :: !Int
  67                  , _tt_model      :: !(Maybe (Tries Token ()))
  68                  }
  69   deriving Generic
  70
  71 makeLenses ''TermType
  72 --group :: [Text] -> [Text]
  73 --group = undefined
  74
  75 -- remove Stop Words
  76 -- map (filter (\t -> not . elem t)) $
  77 ------------------------------------------------------------------------
  78 -- | Sugar to extract terms from text (hiddeng mapM from end user).
  79 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
  80 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
  81
  82 extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
  83   where
  84     m' = case m of
  85       Just m''-> m''
  86       Nothing -> newTries n (Text.intercalate " " xs)
  87
  88 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
  89
  90
  91 ------------------------------------------------------------------------
  92 withLang :: HasText a
  93          => TermType Lang
  94          -> [DocumentWithId a]
  95          -> TermType Lang
  96 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
  97   where
  98     m' = case m of
  99       Nothing -> -- trace ("buildTries here" :: String)
 100                Just $ buildTries n ( fmap toToken
 101                                    $ uniText
 102                                    $ Text.intercalate " . "
 103                                    $ List.concat
 104                                    $ map hasText ns
 105                                    )
 106       just_m -> just_m
 107 withLang l _ = l
 108
 109 ------------------------------------------------------------------------
 110 class ExtractNgramsT h
 111   where
 112     extractNgramsT :: HasText h
 113                    => TermType Lang
 114                    -> h
 115                    -> Cmd err (Map Ngrams (Map NgramsType Int))
 116
 117 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
 118                      -> Map Ngrams (Map NgramsType Int)
 119 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
 120   where
 121     filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
 122           True  -> (ng,y)
 123           False -> (Ngrams (Text.take s' t) n , y)
 124
 125
 126 -- =======================================================
 127
 128 -- | Terms from Text
 129 -- Mono : mono terms
 130 -- Multi : multi terms
 131 -- MonoMulti : mono and multi
 132 -- TODO : multi terms should exclude mono (intersection is not empty yet)
 133 terms :: TermType Lang -> Text -> IO [Terms]
 134 terms (Mono      lang) txt = pure $ monoTerms lang txt
 135 terms (Multi     lang) txt = multiterms lang txt
 136 terms (MonoMulti lang) txt = terms (Multi lang) txt
 137 terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
 138   where
 139     m' = maybe (newTries n txt) identity m
 140 -- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
 141 ------------------------------------------------------------------------
 142
 143 text2term :: Lang -> [Text] -> Terms
 144 text2term _ [] = Terms [] Set.empty
 145 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
 146
 147 isPunctuation :: Text -> Bool
 148 isPunctuation x = List.elem x $  (Text.pack . pure)
 149                              <$> ("!?(),;." :: String)
 150
 151 -- | Unsupervised ngrams extraction
 152 -- language agnostic extraction
 153 -- TODO: remove IO
 154 -- TODO: newtype BlockText
 155
 156 type WindowSize = Int
 157 type MinNgramSize = Int
 158
 159 termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
 160 termsUnsupervised (Unsupervised l n s m) =
 161                pure
 162              . map (text2term l)
 163              . List.nub
 164              . (List.filter (\l' -> List.length l' >= s))
 165              . List.concat
 166              . mainEleveWith (maybe (panic "no model") identity m) n
 167              . uniText
 168 termsUnsupervised _ = undefined
 169
 170 newTries :: Int -> Text -> Tries Token ()
 171 newTries n t = buildTries n (fmap toToken $ uniText t)
 172
 173 -- | TODO removing long terms > 24
 174 uniText :: Text -> [[Text]]
 175 uniText = map (List.filter (not . isPunctuation))
 176         . map tokenize
 177         . sentences       -- TODO get sentences according to lang
 178         . Text.toLower
 179