src/Gargantext/Core/Text/Terms.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Ngrams
   3 Description : Ngrams definition and tools
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 An @n-gram@ is a contiguous sequence of n items from a given sample of
  11 text. In Gargantext application the items are words, n is a non negative
  12 integer.
  13
  14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
  15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
  16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
  17 "four-gram", "five-gram", and so on.
  18
  19 Source: https://en.wikipedia.org/wiki/Ngrams
  20
  21 TODO
  22 group Ngrams -> Tree
  23 compute occ by node of Tree
  24 group occs according groups
  25
  26 compute cooccurrences
  27 compute graph
  28
  29 -}
  30
  31 {-# LANGUAGE TemplateHaskell   #-}
  32 {-# LANGUAGE ConstrainedClassMethods #-}
  33
  34 module Gargantext.Core.Text.Terms
  35   where
  36
  37 import Control.Lens
  38 import Data.Map (Map)
  39 import qualified Data.Map as Map
  40 import Data.Text (Text)
  41 import Data.Traversable
  42 import qualified Data.List as List
  43 import qualified Data.Set  as Set
  44 import qualified Data.Text as Text
  45 import GHC.Base (String)
  46 import GHC.Generics (Generic)
  47
  48 import Gargantext.Core
  49 import Gargantext.Core.Flow.Types
  50 import Gargantext.Core.Text (sentences, HasText(..))
  51 import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
  52 import Gargantext.Core.Text.Terms.Mono  (monoTerms)
  53 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
  54 import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
  55 import Gargantext.Core.Text.Terms.Multi (multiterms)
  56 import Gargantext.Core.Types
  57 import Gargantext.Database.Prelude (Cmd)
  58 import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams)
  59 import Gargantext.Prelude
  60
  61
  62 data TermType lang
  63   = Mono      { _tt_lang :: !lang }
  64   | Multi     { _tt_lang :: !lang }
  65   | MonoMulti { _tt_lang :: !lang }
  66   | Unsupervised { _tt_lang       :: !lang
  67                  , _tt_windowSize :: !Int
  68                  , _tt_ngramsSize :: !Int
  69                  , _tt_model      :: !(Maybe (Tries Token ()))
  70                  }
  71   deriving Generic
  72
  73 makeLenses ''TermType
  74 --group :: [Text] -> [Text]
  75 --group = undefined
  76
  77 -- remove Stop Words
  78 -- map (filter (\t -> not . elem t)) $
  79 ------------------------------------------------------------------------
  80 -- | Sugar to extract terms from text (hiddeng mapM from end user).
  81 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
  82 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
  83
  84 extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
  85   where
  86     m' = case m of
  87       Just m''-> m''
  88       Nothing -> newTries n (Text.intercalate " " xs)
  89
  90 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
  91
  92
  93 ------------------------------------------------------------------------
  94 withLang :: HasText a
  95          => TermType Lang
  96          -> [DocumentWithId a]
  97          -> TermType Lang
  98 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
  99   where
 100     m' = case m of
 101       Nothing -> -- trace ("buildTries here" :: String)
 102                Just $ buildTries n ( fmap toToken
 103                                    $ uniText
 104                                    $ Text.intercalate " . "
 105                                    $ List.concat
 106                                    $ map hasText ns
 107                                    )
 108       just_m -> just_m
 109 withLang l _ = l
 110
 111 ------------------------------------------------------------------------
 112 class ExtractNgramsT h
 113   where
 114     extractNgramsT :: HasText h
 115                    => TermType Lang
 116                    -> h
 117                    -> Cmd err (Map Ngrams (Map NgramsType Int))
 118
 119
 120
 121 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
 122                      -> Map Ngrams (Map NgramsType Int)
 123 filterNgramsT s ms = Map.fromList $ map filter' $ Map.toList ms
 124   where
 125     filter' (ng,y)
 126       | Text.length (ng ^. ngramsTerms) < s = (ng,y)
 127       | otherwise                           = (text2ngrams (Text.take s (ng ^. ngramsTerms)), y)
 128
 129
 130 -- =======================================================
 131
 132 -- | Terms from Text
 133 -- Mono : mono terms
 134 -- Multi : multi terms
 135 -- MonoMulti : mono and multi
 136 -- TODO : multi terms should exclude mono (intersection is not empty yet)
 137 terms :: TermType Lang -> Text -> IO [Terms]
 138 terms (Mono      lang) txt = pure $ monoTerms lang txt
 139 terms (Multi     lang) txt = multiterms lang txt
 140 terms (MonoMulti lang) txt = terms (Multi lang) txt
 141 terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
 142   where
 143     m' = maybe (newTries n txt) identity m
 144 -- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
 145
 146
 147 ------------------------------------------------------------------------
 148
 149 text2term :: Lang -> [Text] -> Terms
 150 text2term _ [] = Terms [] Set.empty
 151 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
 152
 153 isPunctuation :: Text -> Bool
 154 isPunctuation x = List.elem x $  (Text.pack . pure)
 155                              <$> ("!?(),;." :: String)
 156
 157 -- | Unsupervised ngrams extraction
 158 -- language agnostic extraction
 159 -- TODO: remove IO
 160 -- TODO: newtype BlockText
 161
 162 type WindowSize = Int
 163 type MinNgramSize = Int
 164
 165 termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
 166 termsUnsupervised (Unsupervised l n s m) =
 167                pure
 168              . map (text2term l)
 169              . List.nub
 170              . (List.filter (\l' -> List.length l' >= s))
 171              . List.concat
 172              . mainEleveWith (maybe (panic "no model") identity m) n
 173              . uniText
 174 termsUnsupervised _ = undefined
 175
 176 newTries :: Int -> Text -> Tries Token ()
 177 newTries n t = buildTries n (fmap toToken $ uniText t)
 178
 179 -- | TODO removing long terms > 24
 180 uniText :: Text -> [[Text]]
 181 uniText = map (List.filter (not . isPunctuation))
 182         . map tokenize
 183         . sentences       -- TODO get sentences according to lang
 184         . Text.toLower
 185