{-|
Module      : Gargantext.Database.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

-- check userId       CanFillUserCorpus   userCorpusId
-- check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE TemplateHaskell         #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
    , getOrMk_RootWithCorpus
import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.List (concat)
import Data.Map (Map, lookup)
import Data.Maybe (Maybe(..), catMaybes)
import Data.Text (splitOn, intercalate)
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Flow.Types
import Gargantext.Core.Types (Terms(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Contact -- (HyperdataContact(..), ContactWho(..))
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Action.Search (searchInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Table.NodeNodeNgrams2
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Text
import Gargantext.Prelude
import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Text.List (buildNgramsLists, StopSize(..))
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Text.Terms
import GHC.Generics (Generic)
import System.FilePath (FilePath)
import qualified Data.Map as Map
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
import qualified Gargantext.Text.Corpus.API as API
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin

instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs
              <> map ExternalOrigin API.externalAPIs
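
-- Every external API is listed twice: once as an 'InternalOrigin' (search the
-- data already harvested in the local database) and once as an
-- 'ExternalOrigin' (query the remote API directly).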
data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]
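
-- 'DataOld' points to documents already stored in the database (by 'NodeId'),
-- typically found by an internal search; 'DataNew' carries batches of freshly
-- fetched 'HyperdataDocument's that still have to be inserted.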
-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
                                        <$> API.get api (_tt_lang la) q li

getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchInDatabase cId (stemIt q)
-------------------------------------------------------------------------------
flowDataText :: FlowCmdM env err m
flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
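
-- A minimal usage sketch (hypothetical user and corpus id), assuming the text
-- was fetched beforehand with 'getDataText':
--
-- > flowDataText (UserName "alice") (DataNew docs) (Multi EN) corpusId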
------------------------------------------------------------------------
flowAnnuaire :: FlowCmdM env err m
             -> Either CorpusName [CorpusId]
flowAnnuaire u n l filePath = do
  docs <- liftBase $ ((splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
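
-- Usage sketch (hypothetical names and path), assuming the language argument
-- is the same 'TermType Lang' that 'flow' expects:
--
-- > flowAnnuaire (UserName "alice") (Left "annuaire IMT") (Multi EN) "/path/to/imt-users.bin"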
------------------------------------------------------------------------
flowCorpusFile :: FlowCmdM env err m
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purpose)
               -> TermType Lang -> FileFormat -> FilePath
flowCorpusFile u n l la ff fp = do
  docs <- liftBase ( splitEvery 500
  flowCorpus u n la (map (map toHyperdataDocument) docs)
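
-- Usage sketch (hypothetical names and path); @fileFormat@ stands for whatever
-- 'FileFormat' value matches the file being imported:
--
-- > flowCorpusFile (UserName "alice") (Left "my corpus") 10000 (Multi EN) fileFormat "/path/to/docs"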
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           -> Either CorpusName [CorpusId]
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
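
-- | Generic entry point: insert every batch of documents in the master corpus
-- first ('insertMasterDocs'), then attach the resulting ids to the user corpus
-- ('flowCorpusUser').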
flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
     -> Either CorpusName [CorpusId]
flow c u cn la docs = do
  ids <- traverse (insertMasterDocs c la) docs
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
------------------------------------------------------------------------
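-- | Wires already-inserted documents to the user side: gets or creates the
-- user root and corpus, a list and its cooccurrence node, a Texts node, builds
-- the ngrams lists against the master corpus, and finally adds a dashboard and
-- a graph node.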
flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
               -> Either CorpusName [CorpusId]
flowCorpusUser l user corpusName ctype ids = do
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  listId <- getOrMkList userCorpusId userId
  _cooc  <- mkNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  _tId <- mkNode NodeTexts userCorpusId userId
  -- printDebug "Node Text Id" tId

  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  ngs         <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId

  _ <- mkDashboard userCorpusId userId
  _ <- mkGraph     userCorpusId userId
  --_ <- mkPhylo   userCorpusId userId

  -- _ <- mkAnnuaire rootUserId userId
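
-- | Inserts the documents in the master corpus shared by all users, extracts
-- their ngrams and records the (document, ngrams, weight) associations.
-- Documents are tagged with 'addUniqId', so re-inserting the same document is
-- detected through 'ReturnId'.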
insertMasterDocs :: ( FlowCmdM env err m
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c

  -- TODO Type NodeDocumentUnicised
  let docs = map addUniqId hs
  ids <- insertDb masterUserId masterCorpusId docs
      documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
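  -- 'documentsWithId' pairs each database id returned by 'insertDb' with the
  -- original hyperdata, matching them on the documents' unique hash.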
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  maps <- mapNodeIdNgrams
       <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId

  terms2id <- insertNgrams $ Map.keys maps

  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
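  -- 'indexedNgrams' re-keys the extracted maps with the ngrams ids returned by
  -- 'insertNgrams', so the occurrences can be stored against database keys.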
  lId <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)

  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                         <*> Just (fromIntegral w :: Double)
                       | (terms'', mapNgramsTypes)      <- Map.toList maps
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w)                       <- Map.toList mapNodeIdWeight
                       ]

  _ <- Doc.add masterCorpusId ids'
  _cooc <- mkNode NodeListCooc lId masterUserId

  _ <- insertDocNgrams lId indexedNgrams
------------------------------------------------------------------------
------------------------------------------------------------------------
viewUniqId' :: UniqId a
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.toInsert"


toInserted :: [ReturnId]
           -> Map HashId ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted
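
-- Keeps only the rows that were actually inserted, indexed by their unique
-- hash, so 'mergeData' can match them back to the original documents.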
mergeData :: Map HashId ReturnId
          -> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      DocumentWithId <$> fmap reId (lookup sha rs)
------------------------------------------------------------------------
instance HasText HyperdataContact
------------------------------------------------------------------------
------------------------------------------------------------------------
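-- | Runs the given ngrams extraction on each document's hyperdata and pairs
-- the result with the document id.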
documentIdWithNgrams :: HasNodeError err
                     -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e
------------------------------------------------------------------------
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors]
instance HasText HyperdataDocument
  where
    hasText h = catMaybes [ _hyperdataDocument_title    h
                          , _hyperdataDocument_abstract h
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source     = text2ngrams
                         $ maybe "Nothing" identity
                         $ _hyperdataDocument_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . splitOn ", ")
                         $ _hyperdataDocument_institutes doc

              authors    = map text2ngrams
                         $ maybe ["Nothing"] (splitOn ", ")
                         $ _hyperdataDocument_authors doc

          terms' <- map text2ngrams
                 <$> map (intercalate " " . _terms_label)
                 <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ Map.fromList $  [(source, Map.singleton Sources 1)]
                              <> [(i', Map.singleton Institutes  1) | i' <- institutes ]
                              <> [(a', Map.singleton Authors     1) | a' <- authors    ]
                              <> [(t', Map.singleton NgramsTerms 1) | t' <- terms'     ]
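
-- For instance (hypothetical document), a document whose source is "HAL" and
-- whose only author is "Doe J." contributes, before indexing:
--
-- > Map.fromList [ (text2ngrams "HAL",    Map.singleton Sources 1)
-- >              , (text2ngrams "Doe J.", Map.singleton Authors 1)
-- >              , ... ]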