src/Gargantext/Core/Text/Terms.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Ngrams
   3 Description : Ngrams definition and tools
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 An @n-gram@ is a contiguous sequence of n items from a given sample of
  11 text. In Gargantext application the items are words, n is a non negative
  12 integer.
  13
  14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
  15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
  16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
  17 "four-gram", "five-gram", and so on.
  18
  19 Source: https://en.wikipedia.org/wiki/Ngrams
  20
  21 TODO
  22 group Ngrams -> Tree
  23 compute occ by node of Tree
  24 group occs according groups
  25
  26 compute cooccurrences
  27 compute graph
  28
  29 -}
  30
  31 {-# LANGUAGE TemplateHaskell   #-}
  32 {-# LANGUAGE ConstrainedClassMethods #-}
  33
  34 module Gargantext.Core.Text.Terms
  35   where
  36
  37 import Control.Lens
  38 import Data.Map (Map)
  39 import qualified Data.Map as Map
  40 import Data.Text (Text)
  41 import Data.Traversable
  42 import GHC.Base (String)
  43 import GHC.Generics (Generic)
  44 import Gargantext.Core
  45 import Gargantext.Core.Types
  46 import Gargantext.Core.Flow.Types
  47 import Gargantext.Prelude
  48 import Gargantext.Core.Text (sentences, HasText(..))
  49 import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
  50 import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..))
  51 import Gargantext.Core.Text.Terms.Mono  (monoTerms)
  52 import Gargantext.Database.Prelude (Cmd)
  53 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
  54 import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
  55 import Gargantext.Core.Text.Terms.Multi (multiterms)
  56 import qualified Data.List as List
  57 import qualified Data.Set  as Set
  58 import qualified Data.Text as Text
  59
  60
  61 data TermType lang
  62   = Mono      { _tt_lang :: !lang }
  63   | Multi     { _tt_lang :: !lang }
  64   | MonoMulti { _tt_lang :: !lang }
  65   | Unsupervised { _tt_lang  :: !lang
  66                  , _tt_windowSize :: !Int
  67                  , _tt_ngramsSize :: !Int
  68                  , _tt_model      :: !(Maybe (Tries Token ()))
  69                  }
  70   deriving Generic
  71
  72 makeLenses ''TermType
  73 --group :: [Text] -> [Text]
  74 --group = undefined
  75
  76 -- remove Stop Words
  77 -- map (filter (\t -> not . elem t)) $
  78 ------------------------------------------------------------------------
  79 -- | Sugar to extract terms from text (hiddeng mapM from end user).
  80 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
  81 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
  82
  83 extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
  84   where
  85     m' = case m of
  86       Just m''-> m''
  87       Nothing -> newTries n (Text.intercalate " " xs)
  88
  89 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
  90
  91
  92 ------------------------------------------------------------------------
  93 withLang :: HasText a
  94          => TermType Lang
  95          -> [DocumentWithId a]
  96          -> TermType Lang
  97 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
  98   where
  99     m' = case m of
 100       Nothing -> -- trace ("buildTries here" :: String)
 101                Just $ buildTries n ( fmap toToken
 102                                    $ uniText
 103                                    $ Text.intercalate " . "
 104                                    $ List.concat
 105                                    $ map hasText ns
 106                                    )
 107       just_m -> just_m
 108 withLang l _ = l
 109
 110 ------------------------------------------------------------------------
 111 class ExtractNgramsT h
 112   where
 113     extractNgramsT :: HasText h
 114                    => TermType Lang
 115                    -> h
 116                    -> Cmd err (Map Ngrams (Map NgramsType Int))
 117
 118 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
 119                      -> Map Ngrams (Map NgramsType Int)
 120 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
 121   where
 122     filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
 123           True  -> (ng,y)
 124           False -> (Ngrams (Text.take s' t) n , y)
 125
 126
 127 -- =======================================================
 128
 129 -- | Terms from Text
 130 -- Mono : mono terms
 131 -- Multi : multi terms
 132 -- MonoMulti : mono and multi
 133 -- TODO : multi terms should exclude mono (intersection is not empty yet)
 134 terms :: TermType Lang -> Text -> IO [Terms]
 135 terms (Mono      lang) txt = pure $ monoTerms lang txt
 136 terms (Multi     lang) txt = multiterms lang txt
 137 terms (MonoMulti lang) txt = terms (Multi lang) txt
 138 terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
 139   where
 140     m' = maybe (newTries n txt) identity m
 141 -- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
 142 ------------------------------------------------------------------------
 143
 144 text2term :: Lang -> [Text] -> Terms
 145 text2term _ [] = Terms [] Set.empty
 146 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
 147
 148 isPunctuation :: Text -> Bool
 149 isPunctuation x = List.elem x $  (Text.pack . pure)
 150                              <$> ("!?(),;." :: String)
 151
 152 -- | Unsupervised ngrams extraction
 153 -- language agnostic extraction
 154 -- TODO: remove IO
 155 -- TODO: newtype BlockText
 156
 157 type WindowSize = Int
 158 type MinNgramSize = Int
 159
 160 termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
 161 termsUnsupervised (Unsupervised l n s m) =
 162                pure
 163              . map (text2term l)
 164              . List.nub
 165              . (List.filter (\l' -> List.length l' >= s))
 166              . List.concat
 167              . mainEleveWith (maybe (panic "no model") identity m) n
 168              . uniText
 169 termsUnsupervised _ = undefined
 170
 171 newTries :: Int -> Text -> Tries Token ()
 172 newTries n t = buildTries n (fmap toToken $ uniText t)
 173
 174 -- | TODO removing long terms > 24
 175 uniText :: Text -> [[Text]]
 176 uniText = map (List.filter (not . isPunctuation))
 177         . map tokenize
 178         . sentences       -- TODO get sentences according to lang
 179         . Text.toLower
 180