src/Gargantext/Text/Terms.hs

   1 {-|
   2 Module      : Gargantext.Text.Ngrams
   3 Description : Ngrams definition and tools
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 An @n-gram@ is a contiguous sequence of n items from a given sample of
  11 text. In Gargantext application the items are words, n is a non negative
  12 integer.
  13
  14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
  15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
  16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
  17 "four-gram", "five-gram", and so on.
  18
  19 Source: https://en.wikipedia.org/wiki/Ngrams
  20
  21 TODO
  22 group Ngrams -> Tree
  23 compute occ by node of Tree
  24 group occs according groups
  25
  26 compute cooccurrences
  27 compute graph
  28
  29 -}
  30
  31 {-# LANGUAGE NoImplicitPrelude #-}
  32 {-# LANGUAGE OverloadedStrings #-}
  33 {-# LANGUAGE DeriveGeneric     #-}
  34 {-# LANGUAGE TemplateHaskell   #-}
  35
  36 module Gargantext.Text.Terms
  37   where
  38
  39 import Control.Lens
  40 import Data.Text (Text)
  41 import Data.Traversable
  42 import GHC.Base (String)
  43 import GHC.Generics (Generic)
  44 import Gargantext.Core
  45 import Gargantext.Core.Types
  46 import Gargantext.Prelude
  47 import Gargantext.Text (sentences)
  48 import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
  49 import Gargantext.Text.Terms.Mono  (monoTerms)
  50 import Gargantext.Text.Terms.Mono.Stem (stem)
  51 import Gargantext.Text.Terms.Mono.Token.En (tokenize)
  52 import Gargantext.Text.Terms.Multi (multiterms)
  53 import qualified Data.List as List
  54 import qualified Data.Set  as Set
  55 import qualified Data.Text as Text
  56
  57 data TermType lang
  58   = Mono      { _tt_lang :: lang }
  59   | Multi     { _tt_lang :: lang }
  60   | MonoMulti { _tt_lang :: lang }
  61   | Unsupervised { _tt_lang  :: lang
  62                  , _tt_windowSize :: Int
  63                  , _tt_ngramsSize :: Int
  64                  , _tt_model      :: Maybe (Tries Token ())
  65                  }
  66   deriving Generic
  67
  68 makeLenses ''TermType
  69 --group :: [Text] -> [Text]
  70 --group = undefined
  71
  72 -- remove Stop Words
  73 -- map (filter (\t -> not . elem t)) $
  74 ------------------------------------------------------------------------
  75 -- | Sugar to extract terms from text (hiddeng mapM from end user).
  76 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
  77 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
  78
  79 extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
  80   where
  81     m' = case m of
  82       Just m''-> m''
  83       Nothing -> newTries n (Text.intercalate " " xs)
  84
  85 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
  86
  87 ------------------------------------------------------------------------
  88 -- | Terms from Text
  89 -- Mono : mono terms
  90 -- Multi : multi terms
  91 -- MonoMulti : mono and multi
  92 -- TODO : multi terms should exclude mono (intersection is not empty yet)
  93 terms :: TermType Lang -> Text -> IO [Terms]
  94 terms (Mono      lang) txt = pure $ monoTerms lang txt
  95 terms (Multi     lang) txt = multiterms lang txt
  96 terms (MonoMulti lang) txt = terms (Multi lang) txt
  97 terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
  98   where
  99     m' = maybe (newTries n txt) identity m
 100 -- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
 101 ------------------------------------------------------------------------
 102
 103 text2term :: Lang -> [Text] -> Terms
 104 text2term _ [] = Terms [] Set.empty
 105 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
 106
 107 isPunctuation :: Text -> Bool
 108 isPunctuation x = List.elem x $  (Text.pack . pure)
 109                              <$> ("!?(),;." :: String)
 110
 111 -- | Unsupervised ngrams extraction
 112 -- language agnostic extraction
 113 -- TODO: remove IO
 114 -- TODO: newtype BlockText
 115
 116 type WindowSize = Int
 117 type MinNgramSize = Int
 118
 119 termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
 120 termsUnsupervised (Unsupervised l n s m) =
 121                pure
 122              . map (text2term l)
 123              . List.nub
 124              . (List.filter (\l' -> List.length l' >= s))
 125              . List.concat
 126              . mainEleveWith (maybe (panic "no model") identity m) n
 127              . uniText
 128 termsUnsupervised _ = undefined
 129
 130 newTries :: Int -> Text -> Tries Token ()
 131 newTries n t = buildTries n (fmap toToken $ uniText t)
 132
 133 -- | TODO removing long terms > 24
 134 uniText :: Text -> [[Text]]
 135 uniText = map (List.filter (not . isPunctuation))
 136         . map tokenize
 137         . sentences       -- | TODO get sentences according to lang
 138         . Text.toLower
 139