src/Gargantext/Core/Text/Terms.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Ngrams
   3 Description : Ngrams definition and tools
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 An @n-gram@ is a contiguous sequence of n items from a given sample of
  11 text. In Gargantext application the items are words, n is a non negative
  12 integer.
  13
  14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
  15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
  16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
  17 "four-gram", "five-gram", and so on.
  18
  19 Source: https://en.wikipedia.org/wiki/Ngrams
  20
  21 TODO
  22 group Ngrams -> Tree
  23 compute occ by node of Tree
  24 group occs according groups
  25
  26 compute cooccurrences
  27 compute graph
  28
  29 -}
  30
  31 {-# LANGUAGE TemplateHaskell   #-}
  32 {-# LANGUAGE ConstrainedClassMethods #-}
  33
  34 module Gargantext.Core.Text.Terms
  35   where
  36
  37 import Control.Lens
  38 import Data.Map (Map)
  39 import qualified Data.Map as Map
  40 import Data.Text (Text)
  41 import Data.Traversable
  42 import qualified Data.List as List
  43 import qualified Data.Set  as Set
  44 import qualified Data.Text as Text
  45 import GHC.Base (String)
  46 import GHC.Generics (Generic)
  47
  48 import Gargantext.Core
  49 import Gargantext.Core.Flow.Types
  50 import Gargantext.Core.Text (sentences, HasText(..))
  51 import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
  52 import Gargantext.Core.Text.Terms.Mono  (monoTerms)
  53 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
  54 import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
  55 import Gargantext.Core.Text.Terms.Multi (multiterms)
  56 import Gargantext.Core.Types
  57 import Gargantext.Database.Prelude (Cmd)
  58 import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams)
  59 import Gargantext.Prelude
  60
  61
  62 data TermType lang
  63   = Mono      { _tt_lang :: !lang }
  64   | Multi     { _tt_lang :: !lang }
  65   | MonoMulti { _tt_lang :: !lang }
  66   | Unsupervised { _tt_lang       :: !lang
  67                  , _tt_windowSize :: !Int
  68                  , _tt_ngramsSize :: !Int
  69                  , _tt_model      :: !(Maybe (Tries Token ()))
  70                  }
  71   deriving Generic
  72
  73 makeLenses ''TermType
  74 --group :: [Text] -> [Text]
  75 --group = undefined
  76
  77 -- remove Stop Words
  78 -- map (filter (\t -> not . elem t)) $
  79 ------------------------------------------------------------------------
  80 -- | Sugar to extract terms from text (hiddeng mapM from end user).
  81 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
  82 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
  83
  84 extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
  85   where
  86     m' = case m of
  87       Just m''-> m''
  88       Nothing -> newTries n (Text.intercalate " " xs)
  89
  90 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
  91
  92
  93 ------------------------------------------------------------------------
  94 withLang :: HasText a
  95          => TermType Lang
  96          -> [DocumentWithId a]
  97          -> TermType Lang
  98 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
  99   where
 100     m' = case m of
 101       Nothing -> -- trace ("buildTries here" :: String)
 102                Just $ buildTries n ( fmap toToken
 103                                    $ uniText
 104                                    $ Text.intercalate " . "
 105                                    $ List.concat
 106                                    $ map hasText ns
 107                                    )
 108       just_m -> just_m
 109 withLang l _ = l
 110
 111 ------------------------------------------------------------------------
 112 class ExtractNgramsT h
 113   where
 114     extractNgramsT :: HasText h
 115                    => TermType Lang
 116                    -> h
 117                    -> Cmd err (Map Ngrams (Map NgramsType Int))
 118
 119 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
 120                      -> Map Ngrams (Map NgramsType Int)
 121 filterNgramsT s ms = Map.fromList $ map filter' $ Map.toList ms
 122   where
 123     filter' (ng,y)
 124       | Text.length (ng ^. ngramsTerms) < s = (ng,y)
 125       | otherwise                           = (text2ngrams (Text.take s (ng ^. ngramsTerms)), y)
 126
 127
 128 -- =======================================================
 129
 130 -- | Terms from Text
 131 -- Mono : mono terms
 132 -- Multi : multi terms
 133 -- MonoMulti : mono and multi
 134 -- TODO : multi terms should exclude mono (intersection is not empty yet)
 135 terms :: TermType Lang -> Text -> IO [Terms]
 136 terms (Mono      lang) txt = pure $ monoTerms lang txt
 137 terms (Multi     lang) txt = multiterms lang txt
 138 terms (MonoMulti lang) txt = terms (Multi lang) txt
 139 terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
 140   where
 141     m' = maybe (newTries n txt) identity m
 142 -- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
 143
 144
 145 ------------------------------------------------------------------------
 146
 147 text2term :: Lang -> [Text] -> Terms
 148 text2term _ [] = Terms [] Set.empty
 149 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
 150
 151 isPunctuation :: Text -> Bool
 152 isPunctuation x = List.elem x $  (Text.pack . pure)
 153                              <$> ("!?(),;." :: String)
 154
 155 -- | Unsupervised ngrams extraction
 156 -- language agnostic extraction
 157 -- TODO: remove IO
 158 -- TODO: newtype BlockText
 159
 160 type WindowSize = Int
 161 type MinNgramSize = Int
 162
 163 termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
 164 termsUnsupervised (Unsupervised l n s m) =
 165                pure
 166              . map (text2term l)
 167              . List.nub
 168              . (List.filter (\l' -> List.length l' >= s))
 169              . List.concat
 170              . mainEleveWith (maybe (panic "no model") identity m) n
 171              . uniText
 172 termsUnsupervised _ = undefined
 173
 174 newTries :: Int -> Text -> Tries Token ()
 175 newTries n t = buildTries n (fmap toToken $ uniText t)
 176
 177 -- | TODO removing long terms > 24
 178 uniText :: Text -> [[Text]]
 179 uniText = map (List.filter (not . isPunctuation))
 180         . map tokenize
 181         . sentences       -- TODO get sentences according to lang
 182         . Text.toLower
 183