2 Module : Gargantext.Core.Text.Ngrams
3 Description : Ngrams definition and tools
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 An @n-gram@ is a contiguous sequence of n items from a given sample of
11 text. In Gargantext application the items are words, n is a non negative
14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
17 "four-gram", "five-gram", and so on.
19 Source: https://en.wikipedia.org/wiki/Ngrams
23 compute occ by node of Tree
24 group occs according groups
31 {-# LANGUAGE ConstrainedClassMethods #-}
32 {-# LANGUAGE StandaloneDeriving #-}
33 {-# LANGUAGE TemplateHaskell #-}
35 module Gargantext.Core.Text.Terms
39 import Data.HashMap.Strict (HashMap)
40 import Data.Hashable (Hashable)
42 import Data.Text (Text)
43 import Data.Traversable
44 import GHC.Base (String)
45 import GHC.Generics (Generic)
46 import qualified Data.List as List
47 import qualified Data.Set as Set
48 import qualified Data.Text as Text
49 import qualified Data.HashMap.Strict as HashMap
50 import Gargantext.Core
51 import Gargantext.Core.Utils (groupWithCounts)
52 import Gargantext.Core.Text (sentences, HasText(..))
53 import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
54 import Gargantext.Core.Text.Terms.Mono (monoTerms)
55 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
56 import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
57 import Gargantext.Core.Text.Terms.Multi (multiterms)
58 import Gargantext.Core.Types
59 import Gargantext.Database.Prelude (Cmd)
60 import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
61 import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgramsPostag, np_form, np_lem)
62 import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams, NgramsId)
63 import Gargantext.Prelude
66 = Mono { _tt_lang :: !lang }
67 | Multi { _tt_lang :: !lang }
68 | MonoMulti { _tt_lang :: !lang }
69 | Unsupervised { _tt_lang :: !lang
70 , _tt_windowSize :: !Int
71 , _tt_ngramsSize :: !Int
72 , _tt_model :: !(Maybe (Tries Token ()))
75 deriving instance (Show lang) => Show (TermType lang)
78 --group :: [Text] -> [Text]
82 -- map (filter (\t -> not . elem t)) $
83 ------------------------------------------------------------------------
84 -- | Sugar to extract terms from text (hidding 'mapM' from end user).
85 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
86 extractTerms :: TermType Lang -> [Text] -> IO [[TermsWithCount]]
87 extractTerms (Unsupervised {..}) xs = mapM (terms (Unsupervised { _tt_model = Just m', .. })) xs
89 m' = case _tt_model of
91 Nothing -> newTries _tt_windowSize (Text.intercalate " " xs)
92 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
95 ------------------------------------------------------------------------
96 withLang :: (Foldable t, Functor t, HasText h)
100 withLang (Unsupervised {..}) ns = Unsupervised { _tt_model = m', .. }
102 m' = case _tt_model of
103 Nothing -> -- trace ("buildTries here" :: String)
104 Just $ buildTries _tt_ngramsSize
107 $ Text.intercalate " . "
113 ------------------------------------------------------------------------
114 data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
115 | EnrichedNgrams { unEnrichedNgrams :: NgramsPostag }
116 deriving (Eq, Ord, Generic, Show)
118 instance Hashable ExtractedNgrams
120 -- | A typeclass that represents extracting ngrams from an entity.
121 class ExtractNgramsT h
123 extractNgramsT :: HasText h
126 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
127 ------------------------------------------------------------------------
128 enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
129 enrichedTerms l pa po (Terms ng1 ng2) =
130 NgramsPostag l pa po form lem
132 form = text2ngrams $ Text.intercalate " " ng1
133 lem = text2ngrams $ Text.intercalate " " $ Set.toList ng2
135 ------------------------------------------------------------------------
136 cleanNgrams :: Int -> Ngrams -> Ngrams
138 | Text.length (ng ^. ngramsTerms) < s = ng
139 | otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms))
141 cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
142 cleanExtractedNgrams s (SimpleNgrams ng) = SimpleNgrams $ (cleanNgrams s) ng
143 cleanExtractedNgrams s (EnrichedNgrams ng) = EnrichedNgrams $ over np_form (cleanNgrams s)
144 $ over np_lem (cleanNgrams s) ng
146 extracted2ngrams :: ExtractedNgrams -> Ngrams
147 extracted2ngrams (SimpleNgrams ng) = ng
148 extracted2ngrams (EnrichedNgrams ng) = view np_form ng
150 ---------------------------
151 insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId)
152 insertExtractedNgrams ngs = do
153 let (s, e) = List.partition isSimpleNgrams ngs
154 m1 <- insertNgrams (map unSimpleNgrams s)
155 --printDebug "others" m1
157 m2 <- insertNgramsPostag (map unEnrichedNgrams e)
158 --printDebug "terms" m2
160 let result = HashMap.union m1 m2
163 isSimpleNgrams :: ExtractedNgrams -> Bool
164 isSimpleNgrams (SimpleNgrams _) = True
165 isSimpleNgrams _ = False
167 ------------------------------------------------------------------------
168 -- | Terms from 'Text'
169 -- 'Mono' : mono terms
170 -- 'Multi' : multi terms
171 -- 'MonoMulti' : mono and multi
172 -- TODO : multi terms should exclude mono (intersection is not empty yet)
173 terms :: TermType Lang -> Text -> IO [TermsWithCount]
174 terms (Mono lang) txt = pure $ monoTerms lang txt
175 terms (Multi lang) txt = multiterms lang txt
176 terms (MonoMulti lang) txt = terms (Multi lang) txt
177 terms (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
179 m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
180 -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
183 ------------------------------------------------------------------------
184 type WindowSize = Int
185 type MinNgramSize = Int
187 -- | Unsupervised ngrams extraction
188 -- language agnostic extraction
189 -- TODO: newtype BlockText
190 termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
191 termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panic "[termsUnsupervised] no model"
192 termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
193 map (\(t, cnt) -> (text2term _tt_lang t, cnt))
196 . (List.filter (\l' -> List.length l' >= _tt_windowSize))
198 . mainEleveWith _tt_model _tt_ngramsSize
200 termsUnsupervised _ = undefined
203 newTries :: Int -> Text -> Tries Token ()
204 newTries n t = buildTries n (fmap toToken $ uniText t)
206 -- | TODO removing long terms > 24
207 uniText :: Text -> [[Text]]
208 uniText = map (List.filter (not . isPunctuation))
210 . sentences -- TODO get sentences according to lang
213 text2term :: Lang -> [Text] -> Terms
214 text2term _ [] = Terms [] Set.empty
215 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
217 isPunctuation :: Text -> Bool
218 isPunctuation x = List.elem x $ (Text.pack . pure)
219 <$> ("!?(),;.:" :: String)