2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
38 , getOrMk_RootWithCorpus
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
50 import Data.List (concat)
51 import qualified Data.Map as Map
52 import Data.Map (Map, lookup)
53 import Data.Maybe (catMaybes)
56 import Data.Text (splitOn, intercalate)
57 import Data.Traversable (traverse)
58 import Data.Tuple.Extra (first, second)
59 import GHC.Generics (Generic)
60 import System.FilePath (FilePath)
62 import Gargantext.Core (Lang(..))
63 import Gargantext.Core.Ext.IMT (toSchoolName)
64 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
65 import Gargantext.Core.Flow.Types
66 import Gargantext.Core.Text
67 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
68 import Gargantext.Core.Text.List (buildNgramsLists)
69 import Gargantext.Core.Text.Group (StopSize(..), GroupParams(..))
70 import Gargantext.Core.Text.Terms
71 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
72 import Gargantext.Core.Types (Terms(..))
73 import Gargantext.Core.Types.Individu (User(..))
74 import Gargantext.Core.Types.Main
75 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
76 import Gargantext.Database.Action.Flow.List
77 import Gargantext.Database.Action.Flow.Types
78 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
79 import Gargantext.Database.Action.Search (searchDocInDatabase)
80 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
81 import Gargantext.Database.Admin.Types.Hyperdata
82 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
83 import Gargantext.Database.Prelude
84 import Gargantext.Database.Query.Table.Ngrams
85 import Gargantext.Database.Query.Table.Node
86 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
87 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
88 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
89 import Gargantext.Database.Query.Table.NodeNodeNgrams2
90 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
91 import Gargantext.Database.Schema.Node (NodePoly(..))
92 import Gargantext.Prelude
93 import Gargantext.Prelude.Crypto.Hash (Hash)
94 import qualified Gargantext.Core.Text.Corpus.API as API
95 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
97 ------------------------------------------------------------------------
98 -- TODO use internal with API name (could be old data)
99 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
100 | ExternalOrigin { _do_api :: API.ExternalAPIs }
102 deriving (Generic, Eq)
104 makeLenses ''DataOrigin
105 deriveJSON (unPrefix "_do_") ''DataOrigin
106 instance ToSchema DataOrigin where
107 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
109 allDataOrigins :: [DataOrigin]
110 allDataOrigins = map InternalOrigin API.externalAPIs
111 <> map ExternalOrigin API.externalAPIs
114 data DataText = DataOld ![NodeId]
115 | DataNew ![[HyperdataDocument]]
117 -- TODO use the split parameter in config file
118 getDataText :: FlowCmdM env err m
124 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
126 <$> API.get api (_tt_lang la) q li
127 getDataText (InternalOrigin _) _la q _li = do
128 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
129 (UserName userMaster)
131 (Nothing :: Maybe HyperdataCorpus)
132 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
135 -------------------------------------------------------------------------------
136 flowDataText :: ( FlowCmdM env err m
143 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
145 corpusType = (Nothing :: Maybe HyperdataCorpus)
146 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
148 ------------------------------------------------------------------------
150 flowAnnuaire :: (FlowCmdM env err m)
152 -> Either CorpusName [CorpusId]
156 flowAnnuaire u n l filePath = do
157 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
158 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
160 ------------------------------------------------------------------------
161 flowCorpusFile :: (FlowCmdM env err m)
163 -> Either CorpusName [CorpusId]
164 -> Limit -- Limit the number of docs (for dev purpose)
165 -> TermType Lang -> FileFormat -> FilePath
167 flowCorpusFile u n l la ff fp = do
168 docs <- liftBase ( splitEvery 500
172 flowCorpus u n la (map (map toHyperdataDocument) docs)
174 ------------------------------------------------------------------------
175 -- | TODO improve the needed type to create/update a corpus
176 -- (For now, Either is enough)
177 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
179 -> Either CorpusName [CorpusId]
183 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
186 flow :: ( FlowCmdM env err m
192 -> Either CorpusName [CorpusId]
196 flow c u cn la docs = do
197 -- TODO if public insertMasterDocs else insertUserDocs
198 ids <- traverse (insertMasterDocs c la) docs
199 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
201 ------------------------------------------------------------------------
202 flowCorpusUser :: ( FlowCmdM env err m
207 -> Either CorpusName [CorpusId]
211 flowCorpusUser l user corpusName ctype ids = do
213 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
214 -- NodeTexts is first
215 _tId <- insertDefaultNode NodeTexts userCorpusId userId
216 -- printDebug "NodeTexts: " tId
218 -- NodeList is second
219 listId <- getOrMkList userCorpusId userId
220 -- _cooc <- insertDefaultNode NodeListCooc listId userId
221 -- TODO: check if present already, ignore
222 _ <- Doc.add userCorpusId ids
224 -- printDebug "Node Text Ids:" tId
227 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
228 ngs <- buildNgramsLists user (GroupParams l 2 3 (StopSize 3)) userCorpusId masterCorpusId
229 _userListId <- flowList_DbRepo listId ngs
230 _mastListId <- getOrMkList masterCorpusId masterUserId
231 -- _ <- insertOccsUpdates userCorpusId mastListId
232 -- printDebug "userListId" userListId
234 _ <- insertDefaultNode NodeDashboard userCorpusId userId
235 _ <- insertDefaultNode NodeGraph userCorpusId userId
236 --_ <- mkPhylo userCorpusId userId
238 -- _ <- mkAnnuaire rootUserId userId
242 insertMasterDocs :: ( FlowCmdM env err m
250 insertMasterDocs c lang hs = do
251 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
252 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
253 _ <- Doc.add masterCorpusId ids'
255 -- create a corpus with database name (CSV or PubMed)
256 -- add documents to the corpus (create node_node link)
257 -- this will enable global database monitoring
259 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
260 mapNgramsDocs <- mapNodeIdNgrams
261 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
263 terms2id <- insertNgrams $ Map.keys mapNgramsDocs
265 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
268 lId <- getOrMkList masterCorpusId masterUserId
269 mapCgramsId <- listInsertDb lId toNodeNgramsW'
270 $ map (first _ngramsTerms . second Map.keys)
271 $ Map.toList mapNgramsDocs
273 _return <- insertNodeNodeNgrams2
274 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
275 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
276 <*> Just (fromIntegral w :: Double)
277 | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
278 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
279 , (nId, w) <- Map.toList mapNodeIdWeight
282 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
284 _ <- insertDocNgrams lId indexedNgrams
287 ------------------------------------------------------------------------
288 -- TODO Type NodeDocumentUnicised
289 insertDocs :: ( FlowCmdM env err m
296 -> m ([DocId], [DocumentWithId a])
297 insertDocs uId cId hs = do
298 let docs = map addUniqId hs
299 newIds <- insertDb uId cId docs
300 -- printDebug "newIds" newIds
302 newIds' = map reId newIds
303 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
304 _ <- Doc.add cId newIds'
305 pure (newIds', documentsWithId)
309 ------------------------------------------------------------------------
310 viewUniqId' :: UniqId a
313 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
315 err = panic "[ERROR] Database.Flow.toInsert"
318 toInserted :: [ReturnId]
321 Map.fromList . map (\r -> (reUniqId r, r) )
322 . filter (\r -> reInserted r == True)
324 mergeData :: Map Hash ReturnId
326 -> [DocumentWithId a]
327 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
329 toDocumentWithId (sha,hpd) =
330 DocumentWithId <$> fmap reId (lookup sha rs)
333 ------------------------------------------------------------------------
334 instance HasText HyperdataContact
337 ------------------------------------------------------------------------
338 ------------------------------------------------------------------------
339 documentIdWithNgrams :: HasNodeError err
341 -> Cmd err (Map Ngrams (Map NgramsType Int)))
342 -> [DocumentWithId a]
343 -> Cmd err [DocumentIdWithNgrams a]
344 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
346 toDocumentIdWithNgrams d = do
347 e <- f $ documentData d
348 pure $ DocumentIdWithNgrams d e
350 ------------------------------------------------------------------------
351 instance ExtractNgramsT HyperdataContact
353 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
355 extract :: TermType Lang -> HyperdataContact
356 -> Cmd err (Map Ngrams (Map NgramsType Int))
358 let authors = map text2ngrams
359 $ maybe ["Nothing"] (\a -> [a])
360 $ view (hc_who . _Just . cw_lastName) hc'
362 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
364 instance HasText HyperdataDocument
366 hasText h = catMaybes [ _hd_title h
371 instance ExtractNgramsT HyperdataDocument
373 extractNgramsT :: TermType Lang
375 -> Cmd err (Map Ngrams (Map NgramsType Int))
376 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
378 extractNgramsT' :: TermType Lang
380 -> Cmd err (Map Ngrams (Map NgramsType Int))
381 extractNgramsT' lang' doc = do
382 let source = text2ngrams
383 $ maybe "Nothing" identity
386 institutes = map text2ngrams
387 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
390 authors = map text2ngrams
391 $ maybe ["Nothing"] (splitOn ", ")
394 terms' <- map text2ngrams
395 <$> map (intercalate " " . _terms_label)
397 <$> liftBase (extractTerms lang' $ hasText doc)
399 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
400 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
401 <> [(a', Map.singleton Authors 1) | a' <- authors ]
402 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
404 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
406 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
408 instance HasText a => HasText (Node a)
410 hasText (Node _ _ _ _ _ _ _ h) = hasText h