{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

-- check userId       CanFillUserCorpus   userCorpusId
-- check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE TemplateHaskell         #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
    , getOrMk_RootWithCorpus
import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.List (concat)
import qualified Data.Map as Map
import Data.Map (Map, lookup)
import Data.Maybe (Maybe(..), catMaybes)
import Data.Text (splitOn, intercalate)
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import System.FilePath (FilePath)

import Gargantext.Core (Lang(..))
import Gargantext.Core.Flow.Types
import Gargantext.Core.Types (Terms(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Table.NodeNodeNgrams2
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Core.Text
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists, StopSize(..))
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Text.Terms
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
import qualified Gargantext.Core.Text.Corpus.API as API
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs
              <> map ExternalOrigin API.externalAPIs

data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]
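-- 'DataOld' carries the ids of documents already stored in the database,
-- while 'DataNew' carries freshly fetched documents, in batches, that still
-- have to be inserted.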
-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
                                        <$> API.get api (_tt_lang la) q li

getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ DataOld ids
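-- Example use (editor's sketch, not in the original module): fetch documents
-- from an external API and flow them into an existing corpus. The 'PubMed'
-- constructor, the query, the limit, and the 'user'/'corpusId' bindings are
-- assumptions for illustration only.
--
-- > txts <- getDataText (ExternalOrigin API.PubMed) (Multi EN) "transcriptomics" (Just 100)
-- > _    <- flowDataText user txts (Multi EN) corpusId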
-------------------------------------------------------------------------------
flowDataText :: FlowCmdM env err m
flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
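-- 'DataOld' documents are already in the database, so only the user-side
-- corpus structure is (re)built around their ids; 'DataNew' documents go
-- through the full 'flowCorpus' insertion pipeline.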
------------------------------------------------------------------------
flowAnnuaire :: FlowCmdM env err m
             -> Either CorpusName [CorpusId]
flowAnnuaire u n l filePath = do
  docs <- liftBase ((splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
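-- Example use (editor's sketch; the user, annuaire name, term type and file
-- path are hypothetical):
--
-- > _ <- flowAnnuaire (UserName "gargantua") (Left "Annuaire") (Multi EN) "/path/to/contacts"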
------------------------------------------------------------------------
flowCorpusFile :: FlowCmdM env err m
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purpose)
               -> TermType Lang -> FileFormat -> FilePath
flowCorpusFile u n l la ff fp = do
  docs <- liftBase ( splitEvery 500
                 <$> take l
                 <$> parseFile ff fp
                   )
  flowCorpus u n la (map (map toHyperdataDocument) docs)
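-- Example use (editor's sketch; the corpus name, limit, 'CsvHal' format and
-- path are hypothetical):
--
-- > _ <- flowCorpusFile (UserName "gargantua") (Left "My corpus") 10000 (Multi EN) CsvHal "/path/to/corpus.csv"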
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           -> Either CorpusName [CorpusId]
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)

flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
     -> Either CorpusName [CorpusId]
flow c u cn la docs = do
  -- TODO if public insertMasterDocs else insertUserDocs
  ids <- traverse (insertMasterDocs c la) docs
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
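-- 'flow' works in two phases: each batch of documents is first inserted into
-- the master corpus ('insertMasterDocs'), then the resulting ids are attached
-- to the user corpus and its lists ('flowCorpusUser').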
------------------------------------------------------------------------
flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
               -> Either CorpusName [CorpusId]
flowCorpusUser l user corpusName ctype ids = do
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  listId <- getOrMkList userCorpusId userId
  _cooc  <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  _tId <- insertDefaultNode NodeTexts userCorpusId userId
  -- printDebug "Node Text Id" tId

  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  ngs         <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId

  _ <- insertDefaultNode NodeDashboard userCorpusId userId
  _ <- insertDefaultNode NodeGraph     userCorpusId userId
  --_ <- mkPhylo  userCorpusId userId

  -- _ <- mkAnnuaire  rootUserId userId
  pure userCorpusId
-- TODO Type NodeDocumentUnicised
insertDocs :: ( FlowCmdM env err m
  -> m ([DocId], [DocumentWithId a])
insertDocs hs uId cId = do
  let docs = map addUniqId hs
  ids <- insertDb uId cId docs
  let
    ids'            = map reId ids
    documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId ids'
  pure (ids', documentsWithId)
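-- Each document first receives a content-derived unique id ('addUniqId');
-- after insertion, 'toInserted' keeps only the rows the database reports as
-- newly inserted and 'mergeData' pairs them back with their hyperdata.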
insertMasterDocs :: ( FlowCmdM env err m
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs hs masterUserId masterCorpusId

  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs <- mapNodeIdNgrams
                <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId

  terms2id <- insertNgrams $ Map.keys mapNgramsDocs

  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs

  lId         <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
                   $ map (first _ngramsTerms . second Map.keys)
                   $ Map.toList mapNgramsDocs

  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                         <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes)      <- Map.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w)                       <- Map.toList mapNodeIdWeight
                       ]

  _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  _ <- insertDocNgrams lId indexedNgrams
  pure ids'
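-- In short: documents are inserted into the master corpus, their ngrams are
-- extracted per 'NgramsType', inserted into the ngrams table, attached to the
-- master list ('listInsertDb') and finally linked to each document with their
-- weight ('insertNodeNodeNgrams2').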
------------------------------------------------------------------------
------------------------------------------------------------------------
viewUniqId' :: UniqId a
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.toInsert"

toInserted :: [ReturnId]
           -> Map HashId ReturnId
toInserted =
  Map.fromList . map    (\r -> (reUniqId r, r))
               . filter reInserted
mergeData :: Map HashId ReturnId
          -> Map HashId a
          -> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      DocumentWithId <$> fmap reId (lookup sha rs)
                     <*> Just hpd

------------------------------------------------------------------------
instance HasText HyperdataContact

------------------------------------------------------------------------
------------------------------------------------------------------------
documentIdWithNgrams :: HasNodeError err
                     => (a
                     -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ Map.fromList [(a', Map.singleton Authors 1) | a' <- authors]
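-- For contacts, only the last name is extracted, and it is indexed under the
-- 'Authors' ngrams type with weight 1.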
instance HasText HyperdataDocument
  where
    hasText h = catMaybes [ _hd_title    h
                          , _hd_abstract h
                          ]
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . splitOn ", ")
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn ", ")
                      $ _hd_authors doc

          terms' <- map text2ngrams
                 <$> map (intercalate " " . _terms_label)
                 <$> concat
                 <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
                             <> [(i', Map.singleton Institutes  1) | i' <- institutes]
                             <> [(a', Map.singleton Authors     1) | a' <- authors   ]
                             <> [(t', Map.singleton NgramsTerms 1) | t' <- terms'    ]
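-- For illustration (editor's sketch with hypothetical field values): a
-- document whose source is "Nature" and whose authors field is
-- "Marie Curie, Pierre Curie" contributes entries of the shape
--
-- > (text2ngrams "Nature"      , Map.singleton Sources 1)
-- > (text2ngrams "Marie Curie" , Map.singleton Authors 1)
-- > (text2ngrams "Pierre Curie", Map.singleton Authors 1)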