2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
38 , getOrMk_RootWithCorpus
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
50 import Data.List (concat)
51 import qualified Data.Map as Map
52 import Data.Map (Map, lookup)
53 import Data.Maybe (catMaybes)
56 import Data.Text (splitOn, intercalate)
57 import Data.Traversable (traverse)
58 import Data.Tuple.Extra (first, second)
59 import GHC.Generics (Generic)
60 import System.FilePath (FilePath)
62 import Gargantext.Core (Lang(..))
63 import Gargantext.Core.Ext.IMT (toSchoolName)
64 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
65 import Gargantext.Core.Flow.Types
66 import Gargantext.Core.Text
67 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
68 import Gargantext.Core.Text.List (buildNgramsLists,StopSize(..))
69 import Gargantext.Core.Text.Terms
70 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
71 import Gargantext.Core.Types (Terms(..))
72 import Gargantext.Core.Types.Individu (User(..))
73 import Gargantext.Core.Types.Main
74 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
75 import Gargantext.Database.Action.Flow.List
76 import Gargantext.Database.Action.Flow.Types
77 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
78 import Gargantext.Database.Action.Search (searchDocInDatabase)
79 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
80 import Gargantext.Database.Admin.Types.Hyperdata
81 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
82 import Gargantext.Database.Prelude
83 import Gargantext.Database.Query.Table.Ngrams
84 import Gargantext.Database.Query.Table.Node
85 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
86 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
87 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
88 import Gargantext.Database.Query.Table.NodeNodeNgrams2
89 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
90 import Gargantext.Database.Schema.Node (NodePoly(..))
91 import Gargantext.Prelude
92 import Gargantext.Prelude.Crypto.Hash (Hash)
93 import qualified Gargantext.Core.Text.Corpus.API as API
94 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
96 ------------------------------------------------------------------------
97 -- TODO use internal with API name (could be old data)
-- | Where a corpus' documents come from: either this Gargantext instance
--   itself ('InternalOrigin') or an external search API ('ExternalOrigin').
--   Both constructors carry the API tag in '_do_api'.
-- NOTE(review): original lines 100 and 102 are elided in this excerpt.
98 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
99 | ExternalOrigin { _do_api :: API.ExternalAPIs }
101 deriving (Generic, Eq)
-- Lenses plus JSON instances derived with the "_do_" field prefix stripped,
-- kept consistent with the Swagger schema below (same prefix).
103 makeLenses ''DataOrigin
104 deriveJSON (unPrefix "_do_") ''DataOrigin
105 instance ToSchema DataOrigin where
106 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every usable data origin: each external API listed by
--   'API.externalAPIs', wrapped first as internal then as external.
allDataOrigins :: [DataOrigin]
allDataOrigins =
  [ origin api
  | origin <- [InternalOrigin, ExternalOrigin]
  , api    <- API.externalAPIs
  ]
-- | Payload to flow into a corpus: references to documents already in the
--   local database ('DataOld'), or freshly fetched hyperdata documents
--   batched into sub-lists ('DataNew').  Fields are strict.
-- NOTE(review): original line 115 (likely a deriving clause or blank) is
-- elided in this excerpt.
113 data DataText = DataOld ![NodeId]
114 | DataNew ![[HyperdataDocument]]
116 -- TODO use the split parameter in config file
-- | Fetch the texts described by a 'DataOrigin'.
--   * External origin: query the external API with the term-type's
--     language, the query string and the limit, wrapping results as
--     'DataNew'.
--   * Internal origin: full-text search the master corpus for the stemmed
--     query and collect matching node ids (presumably returned as
--     'DataOld' — the final lines of this clause are elided here).
-- NOTE(review): the full type signature (original lines 118-122) is elided
-- in this excerpt.
117 getDataText :: FlowCmdM env err m
123 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
125 <$> API.get api (_tt_lang la) q li
126 getDataText (InternalOrigin _) _la q _li = do
-- Ensure the master user's root and corpus exist; only the corpus id is
-- needed for the search below.
127 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
128 (UserName userMaster)
130 (Nothing :: Maybe HyperdataCorpus)
-- Stem the query before searching so inflected forms still match.
131 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
134 -------------------------------------------------------------------------------
-- | Flow a 'DataText' into the user's corpus @cid@.
--   Old documents (already in the DB) are re-attached via 'flowCorpusUser';
--   new ones go through the full 'flowCorpus' insertion pipeline.
-- NOTE(review): the signature body (136-141) and the `where` keyword line
-- (143) preceding `corpusType` are elided in this excerpt.
135 flowDataText :: ( FlowCmdM env err m
142 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
-- `Nothing :: Maybe HyperdataCorpus` pins the hyperdata type without
-- supplying a value (type-level disambiguation only).
144 corpusType = (Nothing :: Maybe HyperdataCorpus)
145 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
147 ------------------------------------------------------------------------
-- | Build an annuaire (contact directory) from a serialised IMT users
--   file: deserialise the contacts, batch them, and run the generic
--   'flow' with annuaire hyperdata.
-- NOTE(review): signature lines 150 and 152-154 are elided in this excerpt.
149 flowAnnuaire :: (FlowCmdM env err m)
151 -> Either CorpusName [CorpusId]
155 flowAnnuaire u n l filePath = do
-- Batches of 500 keep each insert transaction bounded; same batch size is
-- used by flowCorpusFile below.
156 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
157 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
159 ------------------------------------------------------------------------
-- | Parse a corpus file in the given 'FileFormat' and flow the resulting
--   documents into the corpus, batched by 500.
-- NOTE(review): signature lines 161/165 and the middle of the parsing
-- pipeline (168-170, presumably `parseFile ff fp` plus a take/limit) are
-- elided in this excerpt — the `l` (Limit) parameter's use is not visible.
160 flowCorpusFile :: (FlowCmdM env err m)
162 -> Either CorpusName [CorpusId]
163 -> Limit -- Limit the number of docs (for dev purpose)
164 -> TermType Lang -> FileFormat -> FilePath
166 flowCorpusFile u n l la ff fp = do
167 docs <- liftBase ( splitEvery 500
-- Parsed documents are converted to HyperdataDocument batch-by-batch.
171 flowCorpus u n la (map (map toHyperdataDocument) docs)
173 ------------------------------------------------------------------------
174 -- | TODO improve the needed type to create/update a corpus
175 -- (For now, Either is enough)
-- | 'flow' specialised to corpora: the hyperdata argument is fixed to
--   `Nothing :: Maybe HyperdataCorpus` (type disambiguation only).
-- NOTE(review): signature lines 177 and 179-181 are elided in this excerpt.
176 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
178 -> Either CorpusName [CorpusId]
182 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Generic entry point: insert every batch of documents into the master
--   corpus, then attach the resulting ids to the user's corpus via
--   'flowCorpusUser'.
-- NOTE(review): the constraint/signature block (186-194) is elided in this
-- excerpt.
185 flow :: ( FlowCmdM env err m
191 -> Either CorpusName [CorpusId]
195 flow c u cn la docs = do
196 -- TODO if public insertMasterDocs else insertUserDocs
-- One insertMasterDocs call per batch; ids from all batches are flattened
-- before the per-user flow.
197 ids <- traverse (insertMasterDocs c la) docs
198 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
200 ------------------------------------------------------------------------
-- | Attach already-inserted documents to the user's corpus and build the
--   per-user artefacts: list, texts node, ngrams lists, dashboard, graph.
-- NOTE(review): several signature lines (202-205, 207-209) and the final
-- return statement (234-236) are elided in this excerpt.
201 flowCorpusUser :: ( FlowCmdM env err m
206 -> Either CorpusName [CorpusId]
210 flowCorpusUser l user corpusName ctype ids = do
-- User's root/corpus created on demand if missing.
212 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
213 listId <- getOrMkList userCorpusId userId
214 -- _cooc <- insertDefaultNode NodeListCooc listId userId
215 -- TODO: check if present already, ignore
216 _ <- Doc.add userCorpusId ids
218 _tId <- insertDefaultNode NodeTexts userCorpusId userId
219 -- printDebug "Node Text Ids:" tId
-- The master corpus is looked up with an empty name; presumably resolved
-- by the master user alone — TODO confirm.
222 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
-- NOTE(review): 2, 3 and (StopSize 3) are hard-coded list-building
-- parameters; consider naming or configuring them.
223 ngs <- buildNgramsLists user l 2 3 (StopSize 3) userCorpusId masterCorpusId
224 _userListId <- flowList_DbRepo listId ngs
225 _mastListId <- getOrMkList masterCorpusId masterUserId
226 -- _ <- insertOccsUpdates userCorpusId mastListId
227 -- printDebug "userListId" userListId
229 _ <- insertDefaultNode NodeDashboard userCorpusId userId
230 _ <- insertDefaultNode NodeGraph userCorpusId userId
231 --_ <- mkPhylo userCorpusId userId
233 -- _ <- mkAnnuaire rootUserId userId
-- | Insert documents into the master corpus, extract their ngrams, and
--   persist ngrams plus node<->ngrams weight links.
-- NOTE(review): the constraint/signature block (238-244) and several body
-- lines (249, 253, 257, 259, 261-262, 267, 275-276, 278, 280-281 —
-- including the final return) are elided in this excerpt.
237 insertMasterDocs :: ( FlowCmdM env err m
245 insertMasterDocs c lang hs = do
246 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
247 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
248 _ <- Doc.add masterCorpusId ids'
250 -- create a corpus with database name (CSV or PubMed)
251 -- add documents to the corpus (create node_node link)
252 -- this will enable global database monitoring
254 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
-- Extract ngrams per document, then regroup by ngram -> type -> node id.
255 mapNgramsDocs <- mapNodeIdNgrams
256 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
-- Persist the ngrams terms themselves and re-key the map by their db ids.
258 terms2id <- insertNgrams $ Map.keys mapNgramsDocs
260 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
263 lId <- getOrMkList masterCorpusId masterUserId
264 mapCgramsId <- listInsertDb lId toNodeNgramsW'
265 $ map (first _ngramsTerms . second Map.keys)
266 $ Map.toList mapNgramsDocs
-- Build one NodeNodeNgrams2 row per (ngram, type, node) triple; rows whose
-- cgrams id cannot be resolved are dropped by catMaybes.
268 _return <- insertNodeNodeNgrams2
269 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
270 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
271 <*> Just (fromIntegral w :: Double)
272 | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
273 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
274 , (nId, w) <- Map.toList mapNodeIdWeight
277 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
279 _ <- insertDocNgrams lId indexedNgrams
282 ------------------------------------------------------------------------
283 -- TODO Type NodeDocumentUnicised
-- | Insert documents with de-duplication: tag each document with a unique
--   hash id, insert, then pair the returned db ids back with the inserted
--   documents via their hashes.
-- NOTE(review): signature lines 285-290 and the `let` keyword line (296)
-- introducing newIds'/documentsWithId are elided in this excerpt.
284 insertDocs :: ( FlowCmdM env err m
291 -> m ([DocId], [DocumentWithId a])
292 insertDocs uId cId hs = do
293 let docs = map addUniqId hs
294 newIds <- insertDb uId cId docs
295 -- printDebug "newIds" newIds
297 newIds' = map reId newIds
298 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
299 _ <- Doc.add cId newIds'
300 pure (newIds', documentsWithId)
304 ------------------------------------------------------------------------
-- | Pair a document with its unique hash id.
-- NOTE(review): signature continuation (306-307) and the `where` line
-- (309) before `err` are elided in this excerpt.  This is partial: it
-- panics when the document has no uniq id — assumes addUniqId always ran
-- first (TODO confirm); the panic message says "toInsert", not
-- "viewUniqId'", which is misleading.
305 viewUniqId' :: UniqId a
308 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
310 err = panic "[ERROR] Database.Flow.toInsert"
-- | Index the insert results by their unique hash, keeping only rows the
--   database reports as actually inserted.
-- NOTE(review): lines 314-315 (signature result type and the defining
-- `toInserted =` line) are elided in this excerpt.
-- NOTE(review): `reInserted r == True` is redundant — `filter reInserted`
-- would do (hlint: "Redundant ==").
313 toInserted :: [ReturnId]
316 Map.fromList . map (\r -> (reUniqId r, r) )
317 . filter (\r -> reInserted r == True)
-- | Join inserted return-ids with the original documents by hash; pairs
--   whose hash is absent from the ReturnId map are silently dropped.
-- NOTE(review): lines 320, 323 (`where`) and 326-327 are elided in this
-- excerpt; `hpd` is presumably consumed on the elided continuation line.
-- NOTE(review): `catMaybes . map f` is `mapMaybe f` (hlint).
319 mergeData :: Map Hash ReturnId
321 -> [DocumentWithId a]
322 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
324 toDocumentWithId (sha,hpd) =
325 DocumentWithId <$> fmap reId (lookup sha rs)
328 ------------------------------------------------------------------------
-- | HasText for contacts.
-- NOTE(review): lines 330-331 are elided in this excerpt, so whether this
-- instance supplies a method body or relies on class defaults cannot be
-- confirmed from here.
329 instance HasText HyperdataContact
332 ------------------------------------------------------------------------
333 ------------------------------------------------------------------------
-- | Run the ngrams-extraction function over every document's data,
--   pairing each document with its extracted ngrams map.
-- NOTE(review): signature line 335 and the `where` line (340) are elided
-- in this excerpt.
-- NOTE(review): the helper could be
-- `DocumentIdWithNgrams d <$> f (documentData d)` (hlint: do/pure).
334 documentIdWithNgrams :: HasNodeError err
336 -> Cmd err (Map Ngrams (Map NgramsType Int)))
337 -> [DocumentWithId a]
338 -> Cmd err [DocumentIdWithNgrams a]
339 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
341 toDocumentIdWithNgrams d = do
342 e <- f $ documentData d
343 pure $ DocumentIdWithNgrams d e
345 ------------------------------------------------------------------------
-- | Ngrams extraction for contacts: only the last name is indexed, as an
--   Authors ngram with weight 1.
-- NOTE(review): lines 347 (`where`), 349, 352 and 356 are elided in this
-- excerpt; `hc'` is presumably bound on line 352 from `hc` — confirm.
-- NOTE(review): 255 appears to be a max-ngrams cap passed to
-- filterNgramsT (same constant as the HyperdataDocument instance) — TODO
-- confirm and consider naming it.
-- NOTE(review): the literal string "Nothing" is used as a placeholder
-- author name when the last name is absent — a deliberate sentinel,
-- apparently; do not "fix" without checking downstream consumers.
346 instance ExtractNgramsT HyperdataContact
348 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
350 extract :: TermType Lang -> HyperdataContact
351 -> Cmd err (Map Ngrams (Map NgramsType Int))
353 let authors = map text2ngrams
354 $ maybe ["Nothing"] (\a -> [a])
355 $ view (hc_who . _Just . cw_lastName) hc'
357 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
-- | Text of a document: the title plus further optional fields —
-- NOTE(review): lines 360 (`where`) and 362-365 (the rest of the
-- catMaybes list, presumably abstract etc.) are elided in this excerpt.
359 instance HasText HyperdataDocument
361 hasText h = catMaybes [ _hd_title h
-- | Ngrams extraction for documents: source, institutes, authors and
--   language-extracted terms, each with weight 1 under its NgramsType.
-- NOTE(review): lines 367 (`where`), 369, 372, 374, 379-380, 383-384,
-- 387-388, 391 and 393 are elided in this excerpt — in particular the
-- field accessors (`_hd_source` etc.) feeding each `maybe` are not
-- visible.
-- NOTE(review): 255 is the same hard-coded filterNgramsT cap used by the
-- HyperdataContact instance; the "Nothing" strings are placeholder
-- sentinels for missing fields (see note on the contact instance).
366 instance ExtractNgramsT HyperdataDocument
368 extractNgramsT :: TermType Lang
370 -> Cmd err (Map Ngrams (Map NgramsType Int))
371 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
373 extractNgramsT' :: TermType Lang
375 -> Cmd err (Map Ngrams (Map NgramsType Int))
376 extractNgramsT' lang' doc = do
377 let source = text2ngrams
378 $ maybe "Nothing" identity
-- Institutes are split on ", " then normalised to school names.
381 institutes = map text2ngrams
382 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
385 authors = map text2ngrams
386 $ maybe ["Nothing"] (splitOn ", ")
-- Terms come from the language-aware extractor over the document's text;
-- multi-word labels are re-joined with spaces.
389 terms' <- map text2ngrams
390 <$> map (intercalate " " . _terms_label)
392 <$> liftBase (extractTerms lang' $ hasText doc)
394 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
395 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
396 <> [(a', Map.singleton Authors 1) | a' <- authors ]
397 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | Lift both classes through the Node wrapper by delegating to the
--   hyperdata payload (last field of NodePoly).
-- NOTE(review): lines 400 and 404 (`where`) are elided in this excerpt.
-- NOTE(review): the positional pattern on an 8-field NodePoly is brittle —
-- a record accessor (e.g. the hyperdata field) would survive field
-- additions.
399 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
401 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
403 instance HasText a => HasText (Node a)
405 hasText (Node _ _ _ _ _ _ _ h) = hasText h