2 Module : Gargantext.Text.Ngrams
3 Description : Ngrams definition and tools
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 An @n-gram@ is a contiguous sequence of n items from a given sample of
11 text. In Gargantext application the items are words, n is a non negative
14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
17 "four-gram", "five-gram", and so on.
19 Source: https://en.wikipedia.org/wiki/Ngrams
23 compute occ by node of Tree
24 group occs according groups
31 {-# LANGUAGE NoImplicitPrelude #-}
32 {-# LANGUAGE OverloadedStrings #-}
33 {-# LANGUAGE DeriveGeneric #-}
34 {-# LANGUAGE TemplateHaskell #-}
36 module Gargantext.Text.Terms
40 import Data.Aeson.TH (deriveJSON)
42 import Data.Text (Text)
43 import Data.Traversable
44 import GHC.Base (String)
45 import GHC.Generics (Generic)
46 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
47 import Gargantext.Core
48 import Gargantext.Core.Types
49 import Gargantext.Prelude
50 import Gargantext.Text (sentences)
51 import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
52 import Gargantext.Text.Terms.Mono (monoTerms)
53 import Gargantext.Text.Terms.Mono.Stem (stem)
54 import Gargantext.Text.Terms.Mono.Token.En (tokenize)
55 import Gargantext.Text.Terms.Multi (multiterms)
56 import qualified Data.List as List
57 import qualified Data.Set as Set
58 import qualified Data.Text as Text
61 = Mono { _tt_lang :: lang }
62 | Multi { _tt_lang :: lang }
63 | MonoMulti { _tt_lang :: lang }
64 | Unsupervised { _tt_lang :: lang
65 , _tt_windowSize :: Int
66 , _tt_ngramsSize :: Int
67 , _tt_model :: Maybe (Tries Token ())
72 --group :: [Text] -> [Text]
76 -- map (filter (\t -> not . elem t)) $
77 ------------------------------------------------------------------------
78 -- | Sugar to extract terms from text (hiddeng mapM from end user).
79 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
80 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
82 extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
86 Nothing -> newTries n (Text.intercalate " " xs)
88 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
90 ------------------------------------------------------------------------
93 -- Multi : multi terms
94 -- MonoMulti : mono and multi
95 -- TODO : multi terms should exclude mono (intersection is not empty yet)
96 terms :: TermType Lang -> Text -> IO [Terms]
97 terms (Mono lang) txt = pure $ monoTerms lang txt
98 terms (Multi lang) txt = multiterms lang txt
99 terms (MonoMulti lang) txt = terms (Multi lang) txt
100 terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
102 m' = maybe (newTries n txt) identity m
103 -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
104 ------------------------------------------------------------------------
106 text2term :: Lang -> [Text] -> Terms
107 text2term _ [] = Terms [] Set.empty
108 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
110 isPunctuation :: Text -> Bool
111 isPunctuation x = List.elem x $ (Text.pack . pure)
112 <$> ("!?(),;." :: String)
114 -- | Unsupervised ngrams extraction
115 -- language agnostic extraction
117 -- TODO: newtype BlockText
119 type WindowSize = Int
120 type MinNgramSize = Int
122 termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
123 termsUnsupervised (Unsupervised l n s m) =
127 . (List.filter (\l' -> List.length l' >= s))
129 . mainEleveWith (maybe (panic "no model") identity m) n
131 termsUnsupervised _ = undefined
133 newTries :: Int -> Text -> Tries Token ()
134 newTries n t = buildTries n (fmap toToken $ uniText t)
136 -- | TODO removing long terms > 24
137 uniText :: Text -> [[Text]]
138 uniText = map (List.filter (not . isPunctuation))
140 . sentences -- | TODO get sentences according to lang