2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE ConstrainedClassMethods #-}
22 {-# LANGUAGE ConstraintKinds #-}
23 {-# LANGUAGE InstanceSigs #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
38 , getOrMk_RootWithCorpus
47 import Control.Lens ((^.), view, _Just, makeLenses)
48 import Data.Aeson.TH (deriveJSON)
50 import Data.List (concat)
51 import qualified Data.Map as Map
52 import Data.Map (Map, lookup)
53 import Data.Maybe (catMaybes)
56 import Data.Text (splitOn, intercalate)
57 import Data.Traversable (traverse)
58 import Data.Tuple.Extra (first, second)
59 import GHC.Generics (Generic)
60 import System.FilePath (FilePath)
62 import Gargantext.Core (Lang(..))
63 import Gargantext.Core.Ext.IMT (toSchoolName)
64 import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
65 import Gargantext.Core.Flow.Types
66 import Gargantext.Core.Text
67 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
68 import Gargantext.Core.Text.List (buildNgramsLists,StopSize(..))
69 import Gargantext.Core.Text.Terms
70 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
71 import Gargantext.Core.Types (Terms(..))
72 import Gargantext.Core.Types.Individu (User(..))
73 import Gargantext.Core.Types.Main
74 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
75 import Gargantext.Database.Action.Flow.List
76 import Gargantext.Database.Action.Flow.Types
77 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
78 import Gargantext.Database.Action.Search (searchDocInDatabase)
79 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
80 import Gargantext.Database.Admin.Types.Hyperdata
81 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
82 import Gargantext.Database.Query.Table.Node
83 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
84 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
85 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
86 import Gargantext.Database.Query.Table.Ngrams
87 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
88 import Gargantext.Database.Query.Table.NodeNodeNgrams2
89 import Gargantext.Database.Prelude
90 import Gargantext.Database.Schema.Node (NodePoly(..))
91 import Gargantext.Prelude
92 import Gargantext.Prelude.Crypto.Hash (Hash)
93 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
94 import qualified Gargantext.Core.Text.Corpus.API as API
96 ------------------------------------------------------------------------
97 -- TODO use internal with API name (could be old data)
-- | Provenance of a batch of documents: 'InternalOrigin' marks data we
--   already hold in the database, 'ExternalOrigin' data fetched from a
--   remote source.  Both record which external API is involved.
98 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
99 | ExternalOrigin { _do_api :: API.ExternalAPIs }
-- NOTE(review): a line is missing between the constructors and the
-- deriving clause in this view; deriving applies to DataOrigin.
101 deriving (Generic, Eq)
-- Template Haskell: generate lenses for '_do_api' and JSON instances
-- that strip the "_do_" field prefix.
103 makeLenses ''DataOrigin
104 deriveJSON (unPrefix "_do_") ''DataOrigin
-- | Swagger schema for 'DataOrigin'; field names are exposed without the
--   "_do_" prefix, matching the JSON encoding above.
105 instance ToSchema DataOrigin where
106 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every known data origin: each external API is listed once wrapped as
--   an internal origin, then once as an external one.
allDataOrigins :: [DataOrigin]
allDataOrigins = [ mkOrigin api
                 | mkOrigin <- [InternalOrigin, ExternalOrigin]
                 , api      <- API.externalAPIs
                 ]
-- | Text data to be flowed into a corpus: either ids of documents that
--   already exist ('DataOld') or batches of fresh documents ('DataNew').
--   Fields are strict (bang patterns).
113 data DataText = DataOld ![NodeId]
114 | DataNew ![[HyperdataDocument]]
116 -- TODO use the split parameter in config file
-- | Fetch text data for a query: an external origin calls the remote
--   API, an internal origin searches the master user's corpus.
--   NOTE(review): several signature and body lines are missing from this
--   view; the internal branch presumably wraps 'ids' in 'DataOld' —
--   confirm against the full file.
117 getDataText :: FlowCmdM env err m
123 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
125 <$> API.get api (_tt_lang la) q li
126 getDataText (InternalOrigin _) _la q _li = do
127 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
128 (UserName userMaster)
130 (Nothing :: Maybe HyperdataCorpus)
-- Stem the query before searching; keep only the document ids (fst).
131 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
134 -------------------------------------------------------------------------------
-- | Route 'DataText' into a corpus: already-inserted document ids go
--   straight to 'flowCorpusUser'; fresh documents go through
--   'flowCorpus'.  NOTE(review): signature lines are missing here.
135 flowDataText :: FlowCmdM env err m
141 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
-- 'corpusType' pins the hyperdata type; no corpus payload is supplied.
143 corpusType = (Nothing :: Maybe HyperdataCorpus)
144 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
146 ------------------------------------------------------------------------
-- | Import an "annuaire" (directory of contacts): deserialise IMT users
--   from a file, batch them by 500, and flow them as HyperdataAnnuaire.
148 flowAnnuaire :: FlowCmdM env err m
150 -> Either CorpusName [CorpusId]
154 flowAnnuaire u n l filePath = do
-- Deserialise the whole file in IO, then split into chunks of 500.
155 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
156 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
158 ------------------------------------------------------------------------
-- | Parse a corpus file of the given 'FileFormat' and flow the parsed
--   documents (batched by 500) into the corpus.
--   NOTE(review): the middle of the parsing pipeline (between splitEvery
--   and flowCorpus) is missing from this view.
159 flowCorpusFile :: FlowCmdM env err m
161 -> Either CorpusName [CorpusId]
162 -> Limit -- Limit the number of docs (for dev purpose)
163 -> TermType Lang -> FileFormat -> FilePath
165 flowCorpusFile u n l la ff fp = do
166 docs <- liftBase ( splitEvery 500
170 flowCorpus u n la (map (map toHyperdataDocument) docs)
172 ------------------------------------------------------------------------
173 -- | TODO improve the needed type to create/update a corpus
174 -- (For now, Either is enough)
-- | 'flow' specialised to corpora: fixes the node hyperdata type to
--   'HyperdataCorpus' without supplying an initial payload.
175 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
177 -> Either CorpusName [CorpusId]
181 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Core entry point: insert each batch of documents into the master
--   corpus, then attach all resulting ids to the user's corpus via
--   'flowCorpusUser'.  NOTE(review): signature lines are missing here.
184 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
187 -> Either CorpusName [CorpusId]
191 flow c u cn la docs = do
192 -- TODO if public insertMasterDocs else insertUserDocs
-- One insertMasterDocs call per batch; ids of all batches concatenated.
193 ids <- traverse (insertMasterDocs c la) docs
194 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
196 ------------------------------------------------------------------------
-- | Wire already-inserted documents into the user's world: create (or
--   fetch) the user root/corpus/list, link the documents, build ngrams
--   lists against the master corpus, and create the default child nodes
--   (Texts, Dashboard, Graph).
--   NOTE(review): the final return expression is missing from this view.
197 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
200 -> Either CorpusName [CorpusId]
204 flowCorpusUser l user corpusName ctype ids = do
-- User's root node, corpus and default list (created if absent).
206 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
207 listId <- getOrMkList userCorpusId userId
208 -- _cooc <- insertDefaultNode NodeListCooc listId userId
209 -- TODO: check if present already, ignore
-- Link each inserted document to the user corpus.
210 _ <- Doc.add userCorpusId ids
212 _tId <- insertDefaultNode NodeTexts userCorpusId userId
213 -- printDebug "Node Text Ids:" tId
-- Master corpus (shared, owned by userMaster) for cross-corpus stats.
216 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
-- NOTE(review): 2 / 3 / StopSize 3 are unexplained magic parameters of
-- the list-building heuristic — confirm their meaning in buildNgramsLists.
217 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
218 _userListId <- flowList_DbRepo listId ngs
219 _mastListId <- getOrMkList masterCorpusId masterUserId
220 -- _ <- insertOccsUpdates userCorpusId mastListId
221 -- printDebug "userListId" userListId
-- Default visualisation nodes attached to the user corpus.
223 _ <- insertDefaultNode NodeDashboard userCorpusId userId
224 _ <- insertDefaultNode NodeGraph userCorpusId userId
225 --_ <- mkPhylo userCorpusId userId
227 -- _ <- mkAnnuaire rootUserId userId
-- | Insert documents into the master corpus and index their ngrams:
--   insert the docs, extract ngrams per document, persist the ngrams,
--   then record node<->ngrams links with their weights.
--   NOTE(review): several lines (constraint list, comprehension closing
--   bracket, final return of ids') are missing from this view.
231 insertMasterDocs :: ( FlowCmdM env err m
239 insertMasterDocs c lang hs = do
240 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
241 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
-- Link the new documents to the master corpus.
242 _ <- Doc.add masterCorpusId ids'
244 -- create a corpus with database name (CSV or PubMed)
245 -- add documents to the corpus (create node_node link)
246 -- this will enable global database monitoring
248 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
-- Extract ngrams per document (language-aware) and regroup by ngram.
249 mapNgramsDocs <- mapNodeIdNgrams
250 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
-- Persist the distinct ngrams; terms2id maps each term to its db id.
252 terms2id <- insertNgrams $ Map.keys mapNgramsDocs
254 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs
-- Master list: register the (term, ngrams-types) pairs for this list.
257 lId <- getOrMkList masterCorpusId masterUserId
258 mapCgramsId <- listInsertDb lId toNodeNgramsW'
259 $ map (first _ngramsTerms . second Map.keys)
260 $ Map.toList mapNgramsDocs
-- One NodeNodeNgrams2 row per (document, ngrams-type, term) with its
-- weight; getCgramsId may fail, hence the catMaybes.
262 _return <- insertNodeNodeNgrams2
263 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
264 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
265 <*> Just (fromIntegral w :: Double)
266 | (terms'', mapNgramsTypes) <- Map.toList mapNgramsDocs
267 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
268 , (nId, w) <- Map.toList mapNodeIdWeight
271 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- TODO: this may be redundant with insertNodeNodeNgrams2 above — check.
273 _ <- insertDocNgrams lId indexedNgrams
276 ------------------------------------------------------------------------
277 -- TODO Type NodeDocumentUnicised
-- | Insert documents under (user, corpus): tag each doc with a uniq
--   hash id, insert, link them to the corpus, and return the new ids
--   paired with the documents.  NOTE(review): the 'let' introducing
--   newIds'/documentsWithId is on a line missing from this view.
278 insertDocs :: ( FlowCmdM env err m
285 -> m ([DocId], [DocumentWithId a])
286 insertDocs uId cId hs = do
-- addUniqId stamps each document with its deduplication hash.
287 let docs = map addUniqId hs
288 newIds <- insertDb uId cId docs
289 -- printDebug "newIds" newIds
291 newIds' = map reId newIds
-- Join db return ids with the original docs via their uniq hash.
292 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
293 _ <- Doc.add cId newIds'
294 pure (newIds', documentsWithId)
298 ------------------------------------------------------------------------
-- | Pair a document with its uniq-id hash.  Panics if the hash is
--   absent — callers must have applied 'addUniqId' first.
299 viewUniqId' :: UniqId a
302 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
304 err = panic "[ERROR] Database.Flow.toInsert"
-- | Index the rows actually inserted (reInserted) by their uniq hash.
--   NOTE(review): `reInserted r == True` is redundant — plain
--   `filter reInserted` would do; left unchanged since the defining
--   equation line is not visible in this view.
307 toInserted :: [ReturnId]
310 Map.fromList . map (\r -> (reUniqId r, r) )
311 . filter (\r -> reInserted r == True)
-- | Join inserted db rows with the original documents by uniq hash,
--   keeping only docs whose hash was returned by the insert.
--   NOTE(review): `catMaybes . map f` is `mapMaybe f`; left as-is since
--   surrounding lines are missing from this view.
313 mergeData :: Map Hash ReturnId
315 -> [DocumentWithId a]
316 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
-- Missing hashes yield Nothing and are dropped by catMaybes.
318 toDocumentWithId (sha,hpd) =
319 DocumentWithId <$> fmap reId (lookup sha rs)
322 ------------------------------------------------------------------------
-- | Empty instance: contacts rely on the class's default methods
--   (contacts carry no free text to extract).
323 instance HasText HyperdataContact
326 ------------------------------------------------------------------------
327 ------------------------------------------------------------------------
-- | Run an ngrams-extraction function over each document's data and
--   pair every document with its extracted ngrams map.
328 documentIdWithNgrams :: HasNodeError err
330 -> Cmd err (Map Ngrams (Map NgramsType Int)))
331 -> [DocumentWithId a]
332 -> Cmd err [DocumentIdWithNgrams a]
333 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
-- Extract for one document, keep the document alongside the result.
335 toDocumentIdWithNgrams d = do
336 e <- f $ documentData d
337 pure $ DocumentIdWithNgrams d e
339 ------------------------------------------------------------------------
-- | Contacts contribute only an Authors ngram: the last name (or the
--   literal "Nothing" when absent), capped at 255 by filterNgramsT.
--   NOTE(review): the equation binding hc' is on a missing line.
340 instance ExtractNgramsT HyperdataContact
342 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
344 extract :: TermType Lang -> HyperdataContact
345 -> Cmd err (Map Ngrams (Map NgramsType Int))
-- Fall back to the string "Nothing" when no last name is set.
347 let authors = map text2ngrams
348 $ maybe ["Nothing"] (\a -> [a])
349 $ view (hc_who . _Just . cw_lastName) hc'
351 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
-- | A document's text is its optional title plus further optional
--   fields; Nothings are dropped.  NOTE(review): the rest of the
--   catMaybes list (abstract, etc.) is on lines missing from this view.
353 instance HasText HyperdataDocument
355 hasText h = catMaybes [ _hd_title h
-- | Document ngrams: one Sources ngram, Institutes (split on ", " and
--   normalised via toSchoolName), Authors (split on ", "), and the
--   language-extracted NgramsTerms — all weighted 1, capped at 255.
--   NOTE(review): the field accessors feeding source/institutes/authors
--   are on lines missing from this view.
360 instance ExtractNgramsT HyperdataDocument
362 extractNgramsT :: TermType Lang
364 -> Cmd err (Map Ngrams (Map NgramsType Int))
365 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
367 extractNgramsT' :: TermType Lang
369 -> Cmd err (Map Ngrams (Map NgramsType Int))
370 extractNgramsT' lang' doc = do
-- Missing source falls back to the literal string "Nothing".
371 let source = text2ngrams
372 $ maybe "Nothing" identity
375 institutes = map text2ngrams
376 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
379 authors = map text2ngrams
380 $ maybe ["Nothing"] (splitOn ", ")
-- Terms come from the language-specific extractor over the doc's text.
383 terms' <- map text2ngrams
384 <$> map (intercalate " " . _terms_label)
386 <$> liftBase (extractTerms lang' $ hasText doc)
-- Each ngram maps to a singleton {type: 1} weight.
388 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
389 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
390 <> [(a', Map.singleton Authors 1) | a' <- authors ]
391 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | Lift ngram extraction through a Node wrapper: delegate to the
--   node's hyperdata payload (last field of the NodePoly constructor).
393 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
395 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
-- | Lift 'hasText' through a Node wrapper: delegate to the payload.
397 instance HasText a => HasText (Node a)
399 hasText (Node _ _ _ _ _ _ _ h) = hasText h