src/Gargantext/Text/Terms.hs

   1 {-|
   2 Module      : Gargantext.Text.Ngrams
   3 Description : Ngrams definition and tools
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 An @n-gram@ is a contiguous sequence of n items from a given sample of
  11 text. In Gargantext application the items are words, n is a non negative
  12 integer.
  13
  14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
  15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
  16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
  17 "four-gram", "five-gram", and so on.
  18
  19 Source: https://en.wikipedia.org/wiki/Ngrams
  20
  21 TODO
  22 group Ngrams -> Tree
  23 compute occ by node of Tree
  24 group occs according groups
  25
  26 compute cooccurrences
  27 compute graph
  28
  29 -}
  30
  31 {-# LANGUAGE NoImplicitPrelude #-}
  32 {-# LANGUAGE OverloadedStrings #-}
  33 {-# LANGUAGE DeriveGeneric     #-}
  34 {-# LANGUAGE TemplateHaskell   #-}
  35 {-# LANGUAGE RankNTypes        #-}
  36 {-# LANGUAGE FlexibleContexts  #-}
  37 {-# LANGUAGE ConstrainedClassMethods #-}
  38
  39 module Gargantext.Text.Terms
  40   where
  41
  42 import Control.Lens
  43 import Data.Map (Map)
  44 import qualified Data.Map as Map
  45 import Data.Text (Text)
  46 import Data.Traversable
  47 import GHC.Base (String)
  48 import GHC.Generics (Generic)
  49 import Gargantext.Core
  50 import Gargantext.Core.Types
  51 import Gargantext.Core.Flow.Types
  52 import Gargantext.Prelude
  53 import Gargantext.Text (sentences, HasText(..))
  54 import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
  55 import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..))
  56 import Gargantext.Text.Terms.Mono  (monoTerms)
  57 import Gargantext.Database.Admin.Utils (Cmd)
  58 import Gargantext.Text.Terms.Mono.Stem (stem)
  59 import Gargantext.Text.Terms.Mono.Token.En (tokenize)
  60 import Gargantext.Text.Terms.Multi (multiterms)
  61 import qualified Data.List as List
  62 import qualified Data.Set  as Set
  63 import qualified Data.Text as Text
  64
  65 data TermType lang
  66   = Mono      { _tt_lang :: !lang }
  67   | Multi     { _tt_lang :: !lang }
  68   | MonoMulti { _tt_lang :: !lang }
  69   | Unsupervised { _tt_lang  :: !lang
  70                  , _tt_windowSize :: !Int
  71                  , _tt_ngramsSize :: !Int
  72                  , _tt_model      :: !(Maybe (Tries Token ()))
  73                  }
  74   deriving Generic
  75
  76 makeLenses ''TermType
  77 --group :: [Text] -> [Text]
  78 --group = undefined
  79
  80 -- remove Stop Words
  81 -- map (filter (\t -> not . elem t)) $
  82 ------------------------------------------------------------------------
  83 -- | Sugar to extract terms from text (hiddeng mapM from end user).
  84 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
  85 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
  86
  87 extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
  88   where
  89     m' = case m of
  90       Just m''-> m''
  91       Nothing -> newTries n (Text.intercalate " " xs)
  92
  93 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
  94
  95
  96 ------------------------------------------------------------------------
  97 withLang :: HasText a
  98          => TermType Lang
  99          -> [DocumentWithId a]
 100          -> TermType Lang
 101 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
 102   where
 103     m' = case m of
 104       Nothing -> -- trace ("buildTries here" :: String)
 105                Just $ buildTries n ( fmap toToken
 106                                    $ uniText
 107                                    $ Text.intercalate " . "
 108                                    $ List.concat
 109                                    $ map hasText ns
 110                                    )
 111       just_m -> just_m
 112 withLang l _ = l
 113 ------------------------------------------------------------------------
 114
 115
 116 class ExtractNgramsT h
 117   where
 118     extractNgramsT :: HasText h
 119                    => TermType Lang
 120                    -> h
 121                    -> Cmd err (Map Ngrams (Map NgramsType Int))
 122
 123 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
 124                      -> Map Ngrams (Map NgramsType Int)
 125 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
 126   where
 127     filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
 128           True  -> (ng,y)
 129           False -> (Ngrams (Text.take s' t) n , y)
 130
 131
 132 -- =======================================================
 133
 134 -- | Terms from Text
 135 -- Mono : mono terms
 136 -- Multi : multi terms
 137 -- MonoMulti : mono and multi
 138 -- TODO : multi terms should exclude mono (intersection is not empty yet)
 139 terms :: TermType Lang -> Text -> IO [Terms]
 140 terms (Mono      lang) txt = pure $ monoTerms lang txt
 141 terms (Multi     lang) txt = multiterms lang txt
 142 terms (MonoMulti lang) txt = terms (Multi lang) txt
 143 terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
 144   where
 145     m' = maybe (newTries n txt) identity m
 146 -- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
 147 ------------------------------------------------------------------------
 148
 149 text2term :: Lang -> [Text] -> Terms
 150 text2term _ [] = Terms [] Set.empty
 151 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
 152
 153 isPunctuation :: Text -> Bool
 154 isPunctuation x = List.elem x $  (Text.pack . pure)
 155                              <$> ("!?(),;." :: String)
 156
 157 -- | Unsupervised ngrams extraction
 158 -- language agnostic extraction
 159 -- TODO: remove IO
 160 -- TODO: newtype BlockText
 161
 162 type WindowSize = Int
 163 type MinNgramSize = Int
 164
 165 termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
 166 termsUnsupervised (Unsupervised l n s m) =
 167                pure
 168              . map (text2term l)
 169              . List.nub
 170              . (List.filter (\l' -> List.length l' >= s))
 171              . List.concat
 172              . mainEleveWith (maybe (panic "no model") identity m) n
 173              . uniText
 174 termsUnsupervised _ = undefined
 175
 176 newTries :: Int -> Text -> Tries Token ()
 177 newTries n t = buildTries n (fmap toToken $ uniText t)
 178
 179 -- | TODO removing long terms > 24
 180 uniText :: Text -> [[Text]]
 181 uniText = map (List.filter (not . isPunctuation))
 182         . map tokenize
 183         . sentences       -- | TODO get sentences according to lang
 184         . Text.toLower
 185