{-|
Module      : Gargantext.Database.Action.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX
-}
-- check userId       CanFillUserCorpus   userCorpusId
-- check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE TemplateHaskell         #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
  ( DataOrigin(..)
  , DataText(..)
  , allDataOrigins
  , getDataText
  , flowDataText
  , flowAnnuaire
  , flowCorpusFile
  , flowCorpus
  , flow
  , insertMasterDocs
  , getOrMk_RootWithCorpus
  )
  where
import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.List (concat)
import qualified Data.Map as Map
import Data.Map (Map, lookup)
import Data.Maybe (Maybe(..), catMaybes, fromMaybe)
import Data.Swagger
import Data.Text (splitOn, intercalate)
import Data.Time.Segment (jour)
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import GHC.Generics (Generic)
import System.FilePath (FilePath)

import Gargantext.Core (Lang(..))
import Gargantext.Core.Flow.Types
import Gargantext.Core.Types (Terms(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Query.Table.Node
import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Action.Search (searchDocInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName, nodeTypeId)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Prelude
import Gargantext.Database.Query.Table.Ngrams
import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Query.Table.NodeNodeNgrams2
import Gargantext.Database.Schema.Node (NodePoly(..))
import Gargantext.Core.Ext.IMT (toSchoolName)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Core.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Core.Text
import Gargantext.Prelude
import Gargantext.Prelude.Crypto.Hash (Hash)
import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Core.Text.List (buildNgramsLists, StopSize(..))
import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
import Gargantext.Core.Text.Terms
import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Data.Text as DT
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
                | ExternalOrigin { _do_api :: API.ExternalAPIs }
  deriving (Generic, Eq)

makeLenses ''DataOrigin
deriveJSON (unPrefix "_do_") ''DataOrigin
instance ToSchema DataOrigin where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")

allDataOrigins :: [DataOrigin]
allDataOrigins = map InternalOrigin API.externalAPIs
              <> map ExternalOrigin API.externalAPIs

data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]
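
-- Note: 'DataOld' carries ids of documents already indexed in the local
-- database, while 'DataNew' carries batches of freshly fetched
-- 'HyperdataDocument's that still have to be inserted.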
-- TODO use the split parameter in config file
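-- | Get the text data to flow: either from an external API ('ExternalOrigin',
--   queried through 'API.get') or from documents already present in the local
--   master corpus ('InternalOrigin', via 'searchDocInDatabase').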
getDataText :: FlowCmdM env err m
            => DataOrigin
            -> TermType Lang
            -> API.Query
            -> Maybe API.Limit
            -> m DataText
getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
                                        <$> splitEvery 500
                                        <$> API.get api (_tt_lang la) q li
getDataText (InternalOrigin _) _la q _li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Left "")
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchDocInDatabase cId (stemIt q)
  pure $ DataOld ids
-------------------------------------------------------------------------------
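-- | Flow a 'DataText' into a corpus: documents already in the database
--   ('DataOld') are only attached to the user corpus, while new documents
--   ('DataNew') go through the full 'flowCorpus' insertion.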
flowDataText :: FlowCmdM env err m
             => User
             -> DataText
             -> TermType Lang
             -> CorpusId
             -> m CorpusId
flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
------------------------------------------------------------------------
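-- | Flow an annuaire (directory of contacts): deserialise IMT contacts from a
--   file, split them into batches of 500 and flow them like any other corpus.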
flowAnnuaire :: FlowCmdM env err m
             => User
             -> Either CorpusName [CorpusId]
             -> TermType Lang
             -> FilePath
             -> m AnnuaireId
flowAnnuaire u n l filePath = do
  docs <- liftBase $ ((splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
------------------------------------------------------------------------
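-- | Parse a corpus file in the given 'FileFormat', keep at most 'Limit'
--   documents, and flow them (in batches of 500) into a corpus.
--   Illustrative call only; the constructor names ('Multi', 'CsvHal') are
--   assumed from the terms and parser modules:
--
--   > flowCorpusFile (UserName "user1") (Left "MyCorpus") 10000 (Multi EN) CsvHal "docs.csv"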
flowCorpusFile :: FlowCmdM env err m
               => User
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purpose)
               -> TermType Lang -> FileFormat -> FilePath
               -> m CorpusId
flowCorpusFile u n l la ff fp = do
  docs <- liftBase ( splitEvery 500
                 <$> take l
                 <$> parseFile ff fp
                   )
  flowCorpus u n la (map (map toHyperdataDocument) docs)
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           => User
           -> Either CorpusName [CorpusId]
           -> TermType Lang
           -> [[a]]
           -> m CorpusId
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
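
-- | Generic flow: insert every batch of documents into the master corpus
--   ('insertMasterDocs'), then attach the resulting ids to the user corpus
--   and build its lists with 'flowCorpusUser'.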
flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
     => Maybe c
     -> User
     -> Either CorpusName [CorpusId]
     -> TermType Lang
     -> [[a]]
     -> m CorpusId
flow c u cn la docs = do
  -- TODO if public insertMasterDocs else insertUserDocs
  ids <- traverse (insertMasterDocs c la) docs
  flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
------------------------------------------------------------------------
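-- | Create (or reuse) the user's root and corpus nodes, attach the inserted
--   documents to them, build the ngrams lists and the default child nodes
--   (texts, dashboard, graph, cooccurrences), and return the user corpus id.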
flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
               => Lang
               -> User
               -> Either CorpusName [CorpusId]
               -> Maybe c
               -> [NodeId]
               -> m CorpusId
flowCorpusUser l user corpusName ctype ids = do
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  listId <- getOrMkList userCorpusId userId
  _cooc  <- insertDefaultNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  tId <- insertDefaultNode NodeTexts userCorpusId userId
  printDebug "Node Text Ids:" tId

  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  ngs         <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId

  _ <- insertDefaultNode NodeDashboard userCorpusId userId
  _ <- insertDefaultNode NodeGraph     userCorpusId userId
  --_ <- mkPhylo  userCorpusId userId

  -- _ <- mkAnnuaire  rootUserId userId
  pure userCorpusId
-- TODO Type NodeDocumentUnicised
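-- | Add a uniqueness hash to each document ('addUniqId'), insert the
--   documents in the database, attach them to the given corpus and return
--   the new node ids together with the matching 'DocumentWithId'.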
insertDocs :: ( FlowCmdM env err m
              , FlowCorpus a
              )
              => UserId
              -> CorpusId
              -> [a]
              -> m ([DocId], [DocumentWithId a])
insertDocs uId cId hs = do
  let docs = map addUniqId hs
  newIds <- insertDb uId cId docs
  printDebug "newIds" newIds
  let
    newIds'         = map reId newIds
    documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
  _ <- Doc.add cId newIds'
  pure (newIds', documentsWithId)
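
-- | Build a 'Node' record from a document's hyperdata: the title is truncated
--   to 255 characters and missing publication dates default to year 0,
--   month 1, day 1.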
toNode :: Hyperdata a => NodeType -> ParentId -> UserId -> a -> Node a
toNode NodeDocument p u h = Node 0 "" (nodeTypeId NodeDocument) u (Just p) n date h
  where
    n    = maybe "No Title" (DT.take 255) (_hd_title h)
    date = jour y m d
    y = maybe 0 fromIntegral $ _hd_publication_year  h
    m = fromMaybe 1 $ _hd_publication_month h
    d = fromMaybe 1 $ _hd_publication_day   h
toNode _ _ _ _ = undefined
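
-- | Insert documents into the master corpus: add the documents, extract their
--   ngrams (sources, institutes, authors, terms), insert the ngrams and link
--   documents, ngrams and the master list together.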
insertMasterDocs :: ( FlowCmdM env err m
                    , FlowCorpus a
                    , MkCorpus   c
                    )
                 => Maybe c
                 -> TermType Lang
                 -> [a]
                 -> m [DocId]
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
  (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId hs
  -- (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode NodeDocument masterCorpusId masterUserId) hs)
  _ <- Doc.add masterCorpusId ids'

  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  mapNgramsDocs <- mapNodeIdNgrams
                <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId

  terms2id <- insertNgrams $ Map.keys mapNgramsDocs
  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) mapNgramsDocs

  lId         <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ Map.toList mapNgramsDocs

  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
                                         <*> Just (fromIntegral w :: Double)
                       | (terms''    , mapNgramsTypes)  <- Map.toList mapNgramsDocs
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId        , w)               <- Map.toList mapNodeIdWeight
                       ]

  _cooc <- insertDefaultNode NodeListCooc lId masterUserId
  _     <- insertDocNgrams lId indexedNgrams

  pure ids'
------------------------------------------------------------------------
------------------------------------------------------------------------
viewUniqId' :: UniqId a
            => a
            -> (Hash, a)
viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.toInsert"

toInserted :: [ReturnId]
           -> Map Hash ReturnId
toInserted =
  Map.fromList . map    (\r -> (reUniqId r, r)     )
               . filter (\r -> reInserted r == True)

mergeData :: Map Hash ReturnId
          -> Map Hash a
          -> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      DocumentWithId <$> fmap reId (lookup sha rs)
                     <*> Just hpd
------------------------------------------------------------------------
instance HasText HyperdataContact

------------------------------------------------------------------------
------------------------------------------------------------------------
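-- | Run the given ngrams extractor on every 'DocumentWithId' and pair each
--   document with the ngrams found in it.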
documentIdWithNgrams :: HasNodeError err
                     => (a -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e
------------------------------------------------------------------------
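-- For contacts, only the last name (if any) is extracted, as an 'Authors'
-- ngram with weight 1.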
instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
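
-- The searchable text of a document is the concatenation of its available
-- text fields (title and abstract).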
instance HasText HyperdataDocument
  where
    hasText h = catMaybes [ _hd_title    h
                          , _hd_abstract h
                          ]

instance HasText (Node HyperdataDocument)
  where
    hasText n = catMaybes [ _hd_title    h
                          , _hd_abstract h
                          ]
      where
        h = _node_hyperdata n
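
-- Document ngrams extraction: the source field becomes a 'Sources' ngram,
-- institutes (split on ", " and mapped through 'toSchoolName') become
-- 'Institutes', authors (split on ", ") become 'Authors', and the terms
-- extracted from the document text with 'extractTerms' become 'NgramsTerms',
-- each with weight 1.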
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: TermType Lang
                   -> HyperdataDocument
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: TermType Lang
                        -> HyperdataDocument
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hd_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
                         $ _hd_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn ", ")
                      $ _hd_authors doc

          terms' <- map text2ngrams
                 <$> map (intercalate " " . _terms_label)
                 <$> concat
                 <$> liftBase (extractTerms lang' $ hasText doc)

          pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
                             <> [(i', Map.singleton Institutes  1) | i' <- institutes ]
                             <> [(a', Map.singleton Authors     1) | a' <- authors    ]
                             <> [(t', Map.singleton NgramsTerms 1) | t' <- terms'     ]