]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms.hs
Merge branch 'dev' into dev-openalex
[gargantext.git] / src / Gargantext / Core / Text / Terms.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams
3 Description : Ngrams definition and tools
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 An @n-gram@ is a contiguous sequence of n items from a given sample of
11 text. In Gargantext application the items are words, n is a non negative
12 integer.
13
14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
17 "four-gram", "five-gram", and so on.
18
19 Source: https://en.wikipedia.org/wiki/Ngrams
20
21 TODO
22 group Ngrams -> Tree
23 compute occ by node of Tree
24 group occs according groups
25
26 compute cooccurrences
27 compute graph
28
29 -}
30
31 {-# LANGUAGE ConstrainedClassMethods #-}
32 {-# LANGUAGE StandaloneDeriving #-}
33 {-# LANGUAGE TemplateHaskell #-}
34
35 module Gargantext.Core.Text.Terms
36 where
37
38 import Control.Lens
39 import Data.HashMap.Strict (HashMap)
40 import Data.Hashable (Hashable)
41 import Data.Map.Strict (Map)
42 import Data.Text (Text)
43 import Data.Traversable
44 import GHC.Base (String)
45 import GHC.Generics (Generic)
46 import qualified Data.List as List
47 import qualified Data.Set as Set
48 import qualified Data.Text as Text
49 import qualified Data.HashMap.Strict as HashMap
50 import Gargantext.Core
51 import Gargantext.Core.Utils (groupWithCounts)
52 import Gargantext.Core.Text (sentences, HasText(..))
53 import Gargantext.Core.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
54 import Gargantext.Core.Text.Terms.Mono (monoTerms)
55 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
56 import Gargantext.Core.Text.Terms.Mono.Token.En (tokenize)
57 import Gargantext.Core.Text.Terms.Multi (multiterms)
58 import Gargantext.Core.Types
59 import Gargantext.Database.Prelude (Cmd)
60 import Gargantext.Database.Query.Table.Ngrams (insertNgrams)
61 import Gargantext.Database.Query.Table.NgramsPostag (NgramsPostag(..), insertNgramsPostag, np_form, np_lem)
62 import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..), ngramsTerms, text2ngrams, NgramsId)
63 import Gargantext.Prelude
64
65 data TermType lang
66 = Mono { _tt_lang :: !lang }
67 | Multi { _tt_lang :: !lang }
68 | MonoMulti { _tt_lang :: !lang }
69 | Unsupervised { _tt_lang :: !lang
70 , _tt_windowSize :: !Int
71 , _tt_ngramsSize :: !Int
72 , _tt_model :: !(Maybe (Tries Token ()))
73 }
74 deriving (Generic)
75 deriving instance (Show lang) => Show (TermType lang)
76
77 makeLenses ''TermType
78 --group :: [Text] -> [Text]
79 --group = undefined
80
81 -- remove Stop Words
82 -- map (filter (\t -> not . elem t)) $
83 ------------------------------------------------------------------------
84 -- | Sugar to extract terms from text (hidding 'mapM' from end user).
85 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
86 extractTerms :: NLPServerConfig -> TermType Lang -> [Text] -> IO [[TermsWithCount]]
87 extractTerms ncs (Unsupervised {..}) xs = mapM (terms ncs (Unsupervised { _tt_model = Just m', .. })) xs
88 where
89 m' = case _tt_model of
90 Just m''-> m''
91 Nothing -> newTries _tt_windowSize (Text.intercalate " " xs)
92 extractTerms ncs termTypeLang xs = mapM (terms ncs termTypeLang) xs
93
94
95 ------------------------------------------------------------------------
96 withLang :: (Foldable t, Functor t, HasText h)
97 => TermType Lang
98 -> t h
99 -> TermType Lang
100 withLang (Unsupervised {..}) ns = Unsupervised { _tt_model = m', .. }
101 where
102 m' = case _tt_model of
103 Nothing -> -- trace ("buildTries here" :: String)
104 Just $ buildTries _tt_ngramsSize
105 $ fmap toToken
106 $ uniText
107 $ Text.intercalate " . "
108 $ List.concat
109 $ map hasText ns
110 just_m -> just_m
111 withLang l _ = l
112
113 ------------------------------------------------------------------------
114 data ExtractedNgrams = SimpleNgrams { unSimpleNgrams :: Ngrams }
115 | EnrichedNgrams { unEnrichedNgrams :: NgramsPostag }
116 deriving (Eq, Ord, Generic, Show)
117
118 instance Hashable ExtractedNgrams
119
120 -- | A typeclass that represents extracting ngrams from an entity.
121 class ExtractNgramsT h
122 where
123 extractNgramsT :: HasText h
124 => TermType Lang
125 -> h
126 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int, TermsCount))
127 ------------------------------------------------------------------------
128 enrichedTerms :: Lang -> PosTagAlgo -> POS -> Terms -> NgramsPostag
129 enrichedTerms l pa po (Terms ng1 ng2) =
130 NgramsPostag l pa po form lem
131 where
132 form = text2ngrams $ Text.intercalate " " ng1
133 lem = text2ngrams $ Text.intercalate " " $ Set.toList ng2
134
135 ------------------------------------------------------------------------
136 cleanNgrams :: Int -> Ngrams -> Ngrams
137 cleanNgrams s ng
138 | Text.length (ng ^. ngramsTerms) < s = ng
139 | otherwise = text2ngrams (Text.take s (ng ^. ngramsTerms))
140
141 cleanExtractedNgrams :: Int -> ExtractedNgrams -> ExtractedNgrams
142 cleanExtractedNgrams s (SimpleNgrams ng) = SimpleNgrams $ (cleanNgrams s) ng
143 cleanExtractedNgrams s (EnrichedNgrams ng) = EnrichedNgrams $ over np_form (cleanNgrams s)
144 $ over np_lem (cleanNgrams s) ng
145
146 extracted2ngrams :: ExtractedNgrams -> Ngrams
147 extracted2ngrams (SimpleNgrams ng) = ng
148 extracted2ngrams (EnrichedNgrams ng) = view np_form ng
149
150 ---------------------------
151 insertExtractedNgrams :: [ ExtractedNgrams ] -> Cmd err (HashMap Text NgramsId)
152 insertExtractedNgrams ngs = do
153 let (s, e) = List.partition isSimpleNgrams ngs
154 m1 <- insertNgrams (map unSimpleNgrams s)
155 --printDebug "others" m1
156
157 m2 <- insertNgramsPostag (map unEnrichedNgrams e)
158 --printDebug "terms" m2
159
160 let result = HashMap.union m1 m2
161 pure result
162
163 isSimpleNgrams :: ExtractedNgrams -> Bool
164 isSimpleNgrams (SimpleNgrams _) = True
165 isSimpleNgrams _ = False
166
167 ------------------------------------------------------------------------
168 -- | Terms from 'Text'
169 -- 'Mono' : mono terms
170 -- 'Multi' : multi terms
171 -- 'MonoMulti' : mono and multi
172 -- TODO : multi terms should exclude mono (intersection is not empty yet)
173 terms :: NLPServerConfig -> TermType Lang -> Text -> IO [TermsWithCount]
174 terms _ (Mono lang) txt = pure $ monoTerms lang txt
175 terms ncs (Multi lang) txt = multiterms ncs lang txt
176 terms ncs (MonoMulti lang) txt = terms ncs (Multi lang) txt
177 terms _ (Unsupervised { .. }) txt = pure $ termsUnsupervised (Unsupervised { _tt_model = Just m', .. }) txt
178 where
179 m' = maybe (newTries _tt_ngramsSize txt) identity _tt_model
180 -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
181
182
183 ------------------------------------------------------------------------
184 type WindowSize = Int
185 type MinNgramSize = Int
186
187 -- | Unsupervised ngrams extraction
188 -- language agnostic extraction
189 -- TODO: newtype BlockText
190 termsUnsupervised :: TermType Lang -> Text -> [TermsWithCount]
191 termsUnsupervised (Unsupervised { _tt_model = Nothing }) = panic "[termsUnsupervised] no model"
192 termsUnsupervised (Unsupervised { _tt_model = Just _tt_model, .. }) =
193 map (\(t, cnt) -> (text2term _tt_lang t, cnt))
194 . groupWithCounts
195 -- . List.nub
196 . (List.filter (\l' -> List.length l' >= _tt_windowSize))
197 . List.concat
198 . mainEleveWith _tt_model _tt_ngramsSize
199 . uniText
200 termsUnsupervised _ = undefined
201
202
203 newTries :: Int -> Text -> Tries Token ()
204 newTries n t = buildTries n (fmap toToken $ uniText t)
205
206 -- | TODO removing long terms > 24
207 uniText :: Text -> [[Text]]
208 uniText = map (List.filter (not . isPunctuation))
209 . map tokenize
210 . sentences -- TODO get sentences according to lang
211 . Text.toLower
212
213 text2term :: Lang -> [Text] -> Terms
214 text2term _ [] = Terms [] Set.empty
215 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
216
217 isPunctuation :: Text -> Bool
218 isPunctuation x = List.elem x $ (Text.pack . pure)
219 <$> ("!?(),;.:" :: String)