]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[TextFlow] ngrams size (WIP)
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Action.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
27 ( FlowCmdM
28 , getDataText
29 , flowDataText
30 , flow
31
32 , flowCorpusFile
33 , flowCorpus
34 , flowAnnuaire
35
36 , getOrMkRoot
37 , getOrMk_RootWithCorpus
38 , TermType(..)
39 , DataOrigin(..)
40 , allDataOrigins
41
42 , do_api
43 )
44 where
45
46 import Control.Lens ((^.), view, _Just, makeLenses)
47 import Data.Aeson.TH (deriveJSON)
48 import Data.Either
49 import Data.List (concat)
50 import qualified Data.Map as Map
51 import Data.Map (Map, lookup)
52 import Data.Maybe (Maybe(..), catMaybes)
53 import Data.Monoid
54 import Data.Swagger
55 import Data.Text (splitOn, intercalate)
56 import Data.Traversable (traverse)
57 import Data.Tuple.Extra (first, second)
58 import GHC.Generics (Generic)
59 import System.FilePath (FilePath)
60
61 import Gargantext.Core (Lang(..))
62 import Gargantext.Core.Flow.Types
63 import Gargantext.Core.Types (Terms(..))
64 import Gargantext.Core.Types.Individu (User(..))
65 import Gargantext.Core.Types.Main
66 import Gargantext.Database.Action.Flow.List
67 import Gargantext.Database.Action.Flow.Types
68 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
69 import Gargantext.Database.Query.Table.Node
70 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
71 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
72 import Gargantext.Database.Action.Search (searchDocInDatabase)
73 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
74 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
75 import Gargantext.Database.Admin.Types.Hyperdata
76 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
77 import Gargantext.Database.Prelude
78 import Gargantext.Database.Query.Table.Ngrams
79 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
80 import Gargantext.Database.Query.Table.NodeNodeNgrams2
81 import Gargantext.Database.Schema.Node (NodePoly(..))
82 import Gargantext.Core.Ext.IMT (toSchoolName)
83 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
84 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
85 import Gargantext.Core.Text
86 import Gargantext.Prelude
87 import Gargantext.Prelude.Crypto.Hash (Hash)
88 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
89 import Gargantext.Core.Text.List (buildNgramsLists,StopSize(..))
90 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
91 import Gargantext.Core.Text.Terms
92 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
93 import qualified Gargantext.Core.Text.Corpus.API as API
94
95 ------------------------------------------------------------------------
-- | Origin of the data handled by a flow. Both constructors carry an
-- external API identifier in the shared '_do_api' field.
--
-- In 'getDataText', 'InternalOrigin' leads to a search in the database while
-- 'ExternalOrigin' leads to a call to the external API.
--
-- TODO use internal with API name (could be old data)
-- TODO Web
data DataOrigin
  = InternalOrigin { _do_api :: API.ExternalAPIs }
  | ExternalOrigin { _do_api :: API.ExternalAPIs }
  deriving (Generic, Eq)
101
-- Template Haskell splices: lenses for the fields of 'DataOrigin' and its
-- JSON instances (field names go through 'unPrefix' with the "_do_" prefix;
-- presumably the prefix is stripped -- confirm in Gargantext.Core.Utils.Prefix).
makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin

-- | Swagger schema derived generically, with the matching "_do_" prefix
-- options from 'unPrefixSwagger'.
instance ToSchema DataOrigin
  where
    declareNamedSchema =
      genericDeclareNamedSchema (unPrefixSwagger "_do_")
106
-- | Every 'DataOrigin' that can be built from 'API.externalAPIs': all the
-- 'InternalOrigin' values first, followed by all the 'ExternalOrigin' ones.
allDataOrigins :: [DataOrigin]
allDataOrigins =
  [ mkOrigin api
  | mkOrigin <- [InternalOrigin, ExternalOrigin]
  , api      <- API.externalAPIs
  ]
110
111 ---------------
-- | Text data given to 'flowDataText':
--
-- * 'DataOld': ids of nodes already in the database (see 'getDataText'),
--   which are attached to the user corpus without being re-inserted;
-- * 'DataNew': batches of new documents, which go through 'flowCorpus'.
data DataText
  = DataOld ![NodeId]
  | DataNew ![[HyperdataDocument]]
114
-- | Fetch the text data matching a query, depending on its origin.
--
-- * 'ExternalOrigin': the query, the language of the 'TermType' and the
--   limit are sent to the external API; the result is passed through
--   @splitEvery 500@ and returned as 'DataNew'.
-- * 'InternalOrigin': the stemmed query is searched in the corpus of the
--   master user and the first component of every hit is returned as
--   'DataOld'. The language and the limit are ignored in this case.
--
-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (ExternalOrigin api) la q li =
  liftBase $ (DataNew . splitEvery 500) <$> API.get api (_tt_lang la) q li
getDataText (InternalOrigin _) _la q _li = do
  -- NOTE(review): the master corpus is requested with an empty name here,
  -- whereas 'insertMasterDocs' uses 'corpusMasterName' -- confirm that
  -- 'getOrMk_RootWithCorpus' does not depend on the name for the master user.
  (_masterUserId, _masterRootId, cId) <-
    getOrMk_RootWithCorpus (UserName userMaster)
                           (Left "")
                           (Nothing :: Maybe HyperdataCorpus)
  hits <- searchDocInDatabase cId (stemIt q)
  pure $ DataOld (map fst hits)
132
133 -------------------------------------------------------------------------------
-- | Run the flow on an existing corpus, given its 'CorpusId'.
--
-- 'DataOld' ids are handed directly to 'flowCorpusUser'; 'DataNew' documents
-- go through 'flowCorpus' (and are therefore inserted first).
flowDataText :: FlowCmdM env err m
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> m CorpusId
flowDataText u dataText tt cid =
  case dataText of
    DataOld ids -> flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
    DataNew txt -> flowCorpus u (Right [cid]) tt txt
  where
    -- Only used to select the corpus hyperdata type.
    corpusType = Nothing :: Maybe HyperdataCorpus
144
145 ------------------------------------------------------------------------
-- | Flow of contacts: they are deserialised from the given file with
-- 'deserialiseImtUsersFromFile', passed through @splitEvery 500@ and sent to
-- 'flow' with 'HyperdataAnnuaire' as hyperdata type.
--
-- TODO use proxy
flowAnnuaire :: FlowCmdM env err m
             => User
             -> Either CorpusName [CorpusId]
             -> TermType Lang
             -> FilePath
             -> m AnnuaireId
flowAnnuaire u n l filePath = do
  contacts <- liftBase readContacts
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l contacts
  where
    readContacts :: IO [[HyperdataContact]]
    readContacts = splitEvery 500 <$> deserialiseImtUsersFromFile filePath
156
157 ------------------------------------------------------------------------
-- | Flow of a corpus read from a file.
--
-- The file is parsed with 'parseFile' in the given 'FileFormat', at most
-- @l@ parsed items are kept, the result is passed through @splitEvery 500@,
-- and every item is converted with 'toHyperdataDocument' before being sent
-- to 'flowCorpus'.
flowCorpusFile :: FlowCmdM env err m
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- ^ Limit the number of docs (for dev purpose)
               -> TermType Lang
               -> FileFormat
               -> FilePath
               -> m CorpusId
flowCorpusFile u n l la ff fp = do
  batches <- liftBase $ (splitEvery 500 . take l) <$> parseFile ff fp
  flowCorpus u n la [ map toHyperdataDocument batch | batch <- batches ]
170
171 ------------------------------------------------------------------------
-- | 'flow' specialised to a corpus: the hyperdata type is fixed to
-- 'HyperdataCorpus'.
--
-- TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> [[a]]
           -> m CorpusId
flowCorpus u cn la docs =
  flow (Nothing :: Maybe HyperdataCorpus) u cn la docs
181
182
-- | Generic flow: every batch of documents is inserted with
-- 'insertMasterDocs' (batches are processed in order), then all the
-- resulting ids are given to 'flowCorpusUser'.
flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> [[a]]
     -> m CorpusId
flow c u cn la docs = do
  -- TODO if public insertMasterDocs else insertUserDocs
  idsPerBatch <- traverse (insertMasterDocs c la) docs
  flowCorpusUser (view tt_lang la) u cn c (concat idsPerBatch)
194
195 ------------------------------------------------------------------------
-- | User side of the flow.
--
-- In order: gets or creates the root, corpus and list of the user, adds the
-- given document ids to the user corpus, inserts a default 'NodeTexts' node,
-- builds the ngrams lists from the user and master corpora, stores them in
-- the user list, makes sure the master list exists, and finally inserts
-- default 'NodeDashboard' and 'NodeGraph' nodes.
--
-- Returns the id of the user corpus.
flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids = do
  -- User Flow
  (userId, _rootId, userCorpusId) <-
    getOrMk_RootWithCorpus user corpusName ctype
  listId <- getOrMkList userCorpusId userId
  -- _cooc <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  _tId <- insertDefaultNode NodeTexts userCorpusId userId
  -- printDebug "Node Text Ids:" tId

  -- User List Flow
  (masterUserId, _masterRootId, masterCorpusId) <-
    getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  -- NOTE(review): the meaning of the literals 2, 3 and (StopSize 3) is
  -- defined by 'buildNgramsLists', not visible here.
  ngs         <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId

  -- User Graph Flow (dashboard first, then graph)
  _ <- traverse (\nodeType -> insertDefaultNode nodeType userCorpusId userId)
                [NodeDashboard, NodeGraph]
  --_ <- mkPhylo userCorpusId userId

  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId
  pure userCorpusId
228
-- | Insert documents in the database and attach them to a corpus.
--
-- A unique id is added to every document before insertion. The result is:
--
-- * the ids returned by the insertion, for all the documents;
-- * the documents paired with their id, restricted to the ones the database
--   reports as newly inserted (see 'toInserted' and 'mergeData').
--
-- TODO Type NodeDocumentUnicised
insertDocs :: ( FlowCmdM env err m
              -- , FlowCorpus a
              , FlowInsertDB a
              )
           => UserId
           -> CorpusId
           -> [a]
           -> m ([DocId], [DocumentWithId a])
insertDocs uId cId hs = do
  returned <- insertDb uId cId uniqDocs
  -- printDebug "newIds" returned
  let allIds = map reId returned
  _ <- Doc.add cId allIds
  pure (allIds, mergeData (toInserted returned) docsByHash)
  where
    uniqDocs   = map addUniqId hs
    docsByHash = Map.fromList (map viewUniqId' uniqDocs)
247
248
-- | Insert a batch of documents in the corpus of the master user, together
-- with the ngrams extracted from the newly inserted documents.
--
-- Returns the ids given by 'insertDocs' for the whole batch.
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <-
    getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <-
    insertDocs masterUserId masterCorpusId
               (map (toNode masterUserId masterCorpusId) hs)
  -- NOTE(review): 'insertDocs' already calls 'Doc.add' with the same corpus
  -- and ids; this second call looks redundant -- confirm before removing.
  _ <- Doc.add masterCorpusId ids'
  -- TODO
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs <-
    mapNodeIdNgrams
      <$> documentIdWithNgrams
            (extractNgramsT $ withLang lang documentsWithId)
            documentsWithId

  terms2id <- insertNgrams $ Map.keys mapNgramsDocs
  -- to be removed
  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs

  -- new
  lId <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
    [ (_ngramsTerms ngrams, Map.keys mapNgramsTypes)
    | (ngrams, mapNgramsTypes) <- Map.toList mapNgramsDocs
    ]
  -- insertDocNgrams
  -- One row per (node, ngrams type, term) whose cgrams id is known; the
  -- triples without a cgrams id are skipped.
  _return <- insertNodeNodeNgrams2
    [ NodeNodeNgrams2 nId cgramsId (fromIntegral w :: Double)
    | (terms'', mapNgramsTypes)      <- Map.toList mapNgramsDocs
    , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
    , (nId, w)                       <- Map.toList mapNodeIdWeight
    , Just cgramsId <- [getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')]
    ]

  -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  -- to be removed
  _ <- insertDocNgrams lId indexedNgrams
  pure ids'
293
294 ------------------------------------------------------------------------
295 ------------------------------------------------------------------------
-- | Pair a document with its unique id (hash).
--
-- Panics when the document has no unique id.
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d =
  case view uniqId d of
    Just h  -> (h, d)
    Nothing -> panic "[ERROR] Database.Flow.toInsert"
302
303
-- | Index by unique id the 'ReturnId's flagged as inserted ('reInserted');
-- the other ones are dropped.
toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted returned =
  Map.fromList [ (reUniqId r, r) | r <- returned, reInserted r ]
309
-- | Pair every document with the id found under the same hash in the map of
-- 'ReturnId's. Documents whose hash is absent from that map are dropped.
-- The result follows the key order of the documents map.
mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [DocumentWithId a]
mergeData rs docsByHash =
  [ DocumentWithId (reId returned) hpd
  | (sha, hpd)    <- Map.toList docsByHash
  , Just returned <- [lookup sha rs]
  ]
318
319 ------------------------------------------------------------------------
-- | A contact contributes no text.
--
-- This used to be 'undefined', which crashes without any message as soon as
-- the result is forced. The 'ExtractNgramsT' instance of 'HyperdataContact'
-- in this module only reads the last name and never calls 'hasText', so an
-- empty list is a total replacement.
--
-- NOTE(review): if contacts should expose text later (name, organisation,
-- ...), list the relevant fields here.
instance HasText HyperdataContact
  where
    hasText _ = []
323 ------------------------------------------------------------------------
324 ------------------------------------------------------------------------
-- | Run the given ngrams extraction on the data of every document, in
-- order, and keep each document together with its extracted ngrams.
documentIdWithNgrams :: HasNodeError err
                     => (a
                     -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f =
  traverse (\doc -> DocumentIdWithNgrams doc <$> f (documentData doc))
335
336 ------------------------------------------------------------------------
-- | Ngrams of a contact: only its last name, typed as 'Authors' with a
-- weight of 1. A contact without last name yields the literal "Nothing".
-- The result goes through @filterNgramsT 255@.
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        -- The term type is not used for contacts.
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' =
          pure $ Map.fromList
            [ (author, Map.singleton Authors 1) | author <- authors ]
          where
            lastNames = maybe ["Nothing"] (: [])
                      $ view (hc_who . _Just . cw_lastName) hc'
            authors   = map text2ngrams lastNames
349
-- | Text of a document: its title followed by its abstract, each one only
-- when present.
instance HasText HyperdataDocument
  where
    hasText doc = [ txt | Just txt <- [_hd_title doc, _hd_abstract doc] ]
355
356
-- | Ngrams of a document, each with a weight of 1:
--
-- * 'Sources': the source of the document;
-- * 'Institutes': the institutes, split on ", " and passed through
--   'toSchoolName';
-- * 'Authors': the authors, split on ", ";
-- * 'NgramsTerms': the terms extracted from the text of the document
--   ('hasText') with 'extractTerms'.
--
-- A missing source, institutes or authors field yields the literal
-- "Nothing". The result goes through @filterNgramsT 255@.
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . splitOn ", ")
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn ", ")
                      $ _hd_authors doc

          terms' <- (map (text2ngrams . intercalate " " . _terms_label) . concat)
                <$> liftBase (extractTerms lang' $ hasText doc)

          -- The same ngram can show up under several types (for instance
          -- "Nothing" when source, institutes and authors are all missing,
          -- or a term equal to an author name). A plain 'Map.fromList' keeps
          -- only the last entry of a key and silently loses the other types,
          -- so the per-type maps are merged instead. Every entry is a
          -- singleton of weight 1, hence weights stay at 1 as before.
          pure $ Map.fromListWith Map.union
               $  [(source, Map.singleton Sources     1)                    ]
               <> [(i'    , Map.singleton Institutes  1) | i' <- institutes ]
               <> [(a'    , Map.singleton Authors     1) | a' <- authors    ]
               <> [(t'    , Map.singleton NgramsTerms 1) | t' <- terms'     ]
389
-- | Ngrams of a node: the ngrams of the value held in its last field.
instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
  where
    extractNgramsT termType (Node _ _ _ _ _ _ _ hyperdata) =
      extractNgramsT termType hyperdata
393
-- | Text of a node: the text of the value held in its last field.
instance HasText a => HasText (Node a)
  where
    hasText (Node _ _ _ _ _ _ _ hyperdata) = hasText hyperdata
397
398