src/Gargantext/Core/Text/Terms.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Terms
   3 Description : Ngrams definition and tools
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 An @n-gram@ is a contiguous sequence of n items from a given sample of
  11 text. In the Gargantext application the items are words and n is a
  12 non-negative integer.
  13
  14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
  15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
  16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
  17 "four-gram", "five-gram", and so on.
  18
  19 Source: https://en.wikipedia.org/wiki/Ngrams
  20
  21 TODO
  22 group Ngrams -> Tree
  23 compute occ by node of Tree
  24 group occs according to groups
  25
  26 compute cooccurrences
  27 compute graph
  28
  29 -}
  30
  31 {-# LANGUAGE TemplateHaskell   #-}
  32 {-# LANGUAGE ConstrainedClassMethods #-}
  33
  34 module Gargantext.Core.Text.Terms
  35   where
  36
  37 import Control.Lens
  38 import Data.Map (Map)
  39 import qualified Data.Map as Map
  40 import Data.Text (Text)
  41 import Data.Traversable
  42 import qualified Data.List as List
  43 import qualified Data.Set  as Set
  44 import qualified Data.Text as Text
  45 import GHC.Base (String)
  46 import GHC.Generics (Generic)
  47
  48 import Gargantext.Core
  49 import Gargantext.Core.Flow.Types
  50 import Gargantext.Core.Text (sentences, HasText(..))
  51 import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
  52 import Gargantext.Core.Text.Terms.Mono  (monoTerms)
  53 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
  54 import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
  55 import Gargantext.Core.Text.Terms.Multi (multiterms)
  56 import Gargantext.Core.Types
  57 import Gargantext.Database.Prelude (Cmd)
  58 import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams)
  59 import Gargantext.Prelude
  60
  61
  62 data TermType lang
  63   = Mono      { _tt_lang :: !lang }
  64   | Multi     { _tt_lang :: !lang }
  65   | MonoMulti { _tt_lang :: !lang }
  66   | Unsupervised { _tt_lang  :: !lang
  67                  , _tt_windowSize :: !Int
  68                  , _tt_ngramsSize :: !Int
  69                  , _tt_model      :: !(Maybe (Tries Token ()))
  70                  }
  71   deriving Generic
  72
  73 makeLenses ''TermType
  74 --group :: [Text] -> [Text]
  75 --group = undefined
  76
  77 -- remove Stop Words
  78 -- map (filter (\t -> not . elem t)) $
  79 ------------------------------------------------------------------------
  80 -- | Sugar to extract terms from text (hiddeng mapM from end user).
  81 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
  82 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
  83
  84 extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
  85   where
  86     m' = case m of
  87       Just m''-> m''
  88       Nothing -> newTries n (Text.intercalate " " xs)
  89
  90 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
  91
  92
  93 ------------------------------------------------------------------------
  94 withLang :: HasText a
  95          => TermType Lang
  96          -> [DocumentWithId a]
  97          -> TermType Lang
  98 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
  99   where
 100     m' = case m of
 101       Nothing -> -- trace ("buildTries here" :: String)
 102                Just $ buildTries n ( fmap toToken
 103                                    $ uniText
 104                                    $ Text.intercalate " . "
 105                                    $ List.concat
 106                                    $ map hasText ns
 107                                    )
 108       just_m -> just_m
 109 withLang l _ = l
 110
 111 ------------------------------------------------------------------------
 112 class ExtractNgramsT h
 113   where
 114     extractNgramsT :: HasText h
 115                    => TermType Lang
 116                    -> h
 117                    -> Cmd err (Map Ngrams (Map NgramsType Int))
 118
 119 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
 120                      -> Map Ngrams (Map NgramsType Int)
 121 filterNgramsT s ms = Map.fromList $ map filter' $ Map.toList ms
 122   where
 123     filter' (ng,y)
 124       | Text.length (ng ^. ngramsTerms) < s = (ng,y)
 125       | otherwise                           = (text2ngrams (Text.take s (ng ^. ngramsTerms)), y)
 126
 127
 128 -- =======================================================
 129
 130 -- | Terms from Text
 131 -- Mono : mono terms
 132 -- Multi : multi terms
 133 -- MonoMulti : mono and multi
 134 -- TODO : multi terms should exclude mono (intersection is not empty yet)
 135 terms :: TermType Lang -> Text -> IO [Terms]
 136 terms (Mono      lang) txt = pure $ monoTerms lang txt
 137 terms (Multi     lang) txt = multiterms lang txt
 138 terms (MonoMulti lang) txt = terms (Multi lang) txt
 139 terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
 140   where
 141     m' = maybe (newTries n txt) identity m
 142 -- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
 143 ------------------------------------------------------------------------
 144
 145 text2term :: Lang -> [Text] -> Terms
 146 text2term _ [] = Terms [] Set.empty
 147 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
 148
 149 isPunctuation :: Text -> Bool
 150 isPunctuation x = List.elem x $  (Text.pack . pure)
 151                              <$> ("!?(),;." :: String)
 152
 153 -- | Unsupervised ngrams extraction
 154 -- language agnostic extraction
 155 -- TODO: remove IO
 156 -- TODO: newtype BlockText
 157
 158 type WindowSize = Int
 159 type MinNgramSize = Int
 160
 161 termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
 162 termsUnsupervised (Unsupervised l n s m) =
 163                pure
 164              . map (text2term l)
 165              . List.nub
 166              . (List.filter (\l' -> List.length l' >= s))
 167              . List.concat
 168              . mainEleveWith (maybe (panic "no model") identity m) n
 169              . uniText
 170 termsUnsupervised _ = undefined
 171
 172 newTries :: Int -> Text -> Tries Token ()
 173 newTries n t = buildTries n (fmap toToken $ uniText t)
 174
 175 -- | TODO removing long terms > 24
 176 uniText :: Text -> [[Text]]
 177 uniText = map (List.filter (not . isPunctuation))
 178         . map tokenize
 179         . sentences       -- TODO get sentences according to lang
 180         . Text.toLower
 181