{-| Module : Gargantext.Text.Ngrams Description : Ngrams definition and tools Copyright : (c) CNRS, 2017 - present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX An @n-gram@ is a contiguous sequence of n items from a given sample of text. In Gargantext application the items are words, n is a non negative integer. Using Latin numerical prefixes, an n-gram of size 1 is referred to as a "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size 3 is a "trigram". English cardinal numbers are sometimes used, e.g., "four-gram", "five-gram", and so on. Source: https://en.wikipedia.org/wiki/Ngrams TODO group Ngrams -> Tree compute occ by node of Tree group occs according groups compute cooccurrences compute graph -} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE TemplateHaskell #-} module Gargantext.Text.Terms where import Control.Lens import Data.Text (Text) import Data.Traversable import GHC.Base (String) import Gargantext.Prelude import Gargantext.Core import Gargantext.Core.Types import Gargantext.Text.Terms.Multi (multiterms) import Gargantext.Text.Terms.Mono (monoTerms) import qualified Data.List as List import qualified Data.Text as Text import Gargantext.Text (sentences) import Gargantext.Text.Terms.Mono.Token.En (tokenize) import Gargantext.Text.Terms.Eleve (mainEleve) data TermType lang = Mono { _tt_lang :: lang } | Multi { _tt_lang :: lang } | MonoMulti { _tt_lang :: lang } makeLenses ''TermType --group :: [Text] -> [Text] --group = undefined -- remove Stop Words -- map (filter (\t -> not . elem t)) $ ------------------------------------------------------------------------ -- | Sugar to extract terms from text (hiddeng mapM from end user). --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms]) extractTerms :: TermType Lang -> [Text] -> IO [[Terms]] extractTerms termTypeLang = mapM (terms termTypeLang) ------------------------------------------------------------------------ -- | Terms from Text -- Mono : mono terms -- Multi : multi terms -- MonoMulti : mono and multi -- TODO : multi terms should exclude mono (intersection is not empty yet) terms :: TermType Lang -> Text -> IO [Terms] terms (Mono lang) txt = pure $ monoTerms lang txt terms (Multi lang) txt = multiterms lang txt terms (MonoMulti lang) txt = terms (Multi lang) txt -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt ------------------------------------------------------------------------ isPunctuation :: Text -> Bool isPunctuation x = List.elem x $ (Text.pack . pure) <$> ("!?(),;." :: String) -- | Unsupervised ngrams extraction -- language agnostic extraction -- TODO: remove IO -- TODO: newtype BlockText extractTermsUnsupervised :: Int -> Text -> [[Text]] extractTermsUnsupervised n = List.nub . (List.filter (\l -> List.length l > 1)) . List.concat . mainEleve n . map (map Text.toLower) . map (List.filter (not . isPunctuation)) . map tokenize . sentences