2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstrainedClassMethods #-}
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE InstanceSigs #-}
23 {-# LANGUAGE ScopedTypeVariables #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
38 , getOrMk_RootWithCorpus
44 , indexAllDocumentsWithPosTag
48 import Control.Lens ((^.), view, _Just, makeLenses)
49 import Data.Aeson.TH (deriveJSON)
51 import Data.HashMap.Strict (HashMap)
52 import Data.Hashable (Hashable)
53 import Data.List (concat)
54 import Data.Map (Map, lookup)
55 import Data.Maybe (catMaybes)
58 import qualified Data.Text as T
59 import Data.Traversable (traverse)
60 import Data.Tuple.Extra (first, second)
61 import GHC.Generics (Generic)
62 import System.FilePath (FilePath)
63 import qualified Data.HashMap.Strict as HashMap
64 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
65 import qualified Data.Map as Map
67 import Gargantext.Core (Lang(..), PosTagAlgo(..))
68 import Gargantext.Core.Ext.IMT (toSchoolName)
69 import Gargantext.Core.Ext.IMTUser (readFile_Annuaire)
70 import Gargantext.Core.Flow.Types
71 import Gargantext.Core.Text
72 import Gargantext.Core.Text.List.Group.WithStem ({-StopSize(..),-} GroupParams(..))
73 import Gargantext.Core.Text.Corpus.Parsers (parseFile, FileFormat)
74 import Gargantext.Core.Text.List (buildNgramsLists)
75 import Gargantext.Core.Text.List.Social (FlowSocialListWith)
76 import Gargantext.Core.Text.Terms
77 import Gargantext.Core.Text.Terms.Mono.Stem.En (stemIt)
78 import Gargantext.Core.Types (POS(NP))
79 import Gargantext.Core.Types.Individu (User(..))
80 import Gargantext.Core.Types.Main
81 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
82 import Gargantext.Database.Action.Flow.List
83 import Gargantext.Database.Action.Flow.Types
84 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams, DocumentIdWithNgrams(..))
85 import Gargantext.Database.Action.Search (searchDocInDatabase)
86 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
87 import Gargantext.Database.Admin.Types.Hyperdata
88 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
89 import Gargantext.Database.Prelude
90 import Gargantext.Database.Query.Table.Ngrams
91 import Gargantext.Database.Query.Table.Node
92 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
93 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
94 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
95 import Gargantext.Database.Query.Table.NodeNodeNgrams2
96 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
97 import Gargantext.Database.Schema.Node (NodePoly(..), node_id)
98 import Gargantext.Database.Types
99 import Gargantext.Prelude
100 import Gargantext.Prelude.Crypto.Hash (Hash)
101 import qualified Gargantext.Core.Text.Corpus.API as API
102 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
104 ------------------------------------------------------------------------
105 -- Imports for upgrade function
106 import Gargantext.Database.Query.Tree.Root (getRootId)
107 import Gargantext.Database.Query.Tree (findNodesId)
108 import qualified Data.List as List
109 ------------------------------------------------------------------------
-- | Provenance of data entering the flow: produced inside Gargantext
-- ('InternalOrigin') or fetched from an external service ('ExternalOrigin').
-- Both constructors carry the external-API tag in '_do_api'.
-- NOTE(review): one original line (113) is not visible in this chunk.
110 -- TODO use internal with API name (could be old data)
111 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
112 | ExternalOrigin { _do_api :: API.ExternalAPIs }
114 deriving (Generic, Eq)
-- Template-Haskell splices: generate lenses (e.g. @do_api@) and JSON
-- To/FromJSON instances with the "_do_" field prefix stripped.
116 makeLenses ''DataOrigin
117 deriveJSON (unPrefix "_do_") ''DataOrigin
-- | Swagger schema mirrors the JSON encoding (same "_do_" prefix removal).
118 instance ToSchema DataOrigin where
119 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every supported data origin: an 'InternalOrigin' and an
-- 'ExternalOrigin' for each external API known to the system.
-- Internal origins come first, matching the original ordering.
allDataOrigins :: [DataOrigin]
allDataOrigins = concatMap (\wrap -> map wrap API.externalAPIs)
                           [InternalOrigin, ExternalOrigin]
-- | Input to the text flow: either ids of documents already stored in the
-- database ('DataOld') or freshly fetched documents, already batched into
-- sub-lists ('DataNew'). Both fields are strict.
126 data DataText = DataOld ![NodeId]
127 | DataNew ![[HyperdataDocument]]
129 -- TODO use the split parameter in config file
-- | Obtain the data to flow. External origins query the remote API and
-- return fresh documents ('DataNew'); internal origins search the master
-- corpus already in the database and return ids ('DataOld', presumably —
-- the final line of this equation is not visible here; confirm in full file).
-- NOTE(review): the constraint list and several body lines are missing from
-- this chunk (orig. lines 131-135, 137, 139, 143, 146-147).
130 getDataText :: FlowCmdM env err m
-- External: fetch up to @li@ documents matching query @q@ in language @la@.
136 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
138 <$> API.get api (_tt_lang la) q li
-- Internal: locate (or create) the master user's root and corpus, then
-- search the corpus for the stemmed query, keeping only document ids.
140 getDataText (InternalOrigin _) _la q _li = do
141 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
142 (UserName userMaster)
144 (Nothing :: Maybe HyperdataCorpus)
145 ids <- map fst <$> searchDocInDatabase cId (stemIt q)
148 -------------------------------------------------------------------------------
-- | Flow a 'DataText' into the given corpus for user @u@.
-- 'DataOld' skips insertion and only (re)builds the user-corpus structures;
-- 'DataNew' runs the full corpus flow on the fetched documents.
-- NOTE(review): parts of the signature are missing from this chunk.
149 flowDataText :: ( FlowCmdM env err m
155 -> Maybe FlowSocialListWith
157 flowDataText u (DataOld ids) tt cid mfslw = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids mfslw
-- corpusType pins the hyperdata type; Nothing means "use defaults".
159 corpusType = (Nothing :: Maybe HyperdataCorpus)
160 flowDataText u (DataNew txt) tt cid mfslw = flowCorpus u (Right [cid]) tt mfslw txt
162 ------------------------------------------------------------------------
-- | Flow an "annuaire" (directory of contacts) read from a file.
-- Contacts are read in one IO action, split into batches of 500, and fed
-- through the generic 'flow' with annuaire hyperdata.
-- NOTE(review): parts of the signature (orig. lines 165, 167-169) are
-- missing from this chunk.
164 flowAnnuaire :: (FlowCmdM env err m)
166 -> Either CorpusName [CorpusId]
170 flowAnnuaire u n l filePath = do
171 docs <- liftBase $ (( splitEvery 500 <$> readFile_Annuaire filePath) :: IO [[HyperdataContact]])
172 flow (Nothing :: Maybe HyperdataAnnuaire) u n l Nothing docs
174 ------------------------------------------------------------------------
-- | Flow a corpus parsed from a local file: parse with the given
-- 'FileFormat', truncate to @l@ documents, batch by 500, convert each to
-- 'HyperdataDocument' and run 'flowCorpus'. A parse failure panics.
-- NOTE(review): the @case eParsed of Right parsed ->@ line (orig. 184-185)
-- is not visible in this chunk; @parsed@ below comes from that match.
175 flowCorpusFile :: (FlowCmdM env err m)
177 -> Either CorpusName [CorpusId]
178 -> Limit -- Limit the number of docs (for dev purpose)
179 -> TermType Lang -> FileFormat -> FilePath
180 -> Maybe FlowSocialListWith
182 flowCorpusFile u n l la ff fp mfslw = do
183 eParsed <- liftBase $ parseFile ff fp
186 let docs = splitEvery 500 $ take l parsed
187 flowCorpus u n la mfslw (map (map toHyperdataDocument) docs)
188 Left e -> panic $ "Error: " <> (T.pack e)
190 ------------------------------------------------------------------------
191 -- | TODO improve the needed type to create/update a corpus
192 -- (For now, Either is enough)
-- Specialisation of 'flow' with corpus hyperdata fixed to Nothing
-- (i.e. default 'HyperdataCorpus').
193 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
195 -> Either CorpusName [CorpusId]
197 -> Maybe FlowSocialListWith
200 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Generic entry point: insert every batch of documents into the master
-- corpus, then build the user's corpus structures over the collected ids.
-- NOTE(review): several constraint/parameter lines of the signature
-- (orig. 204-213) are missing from this chunk.
203 flow :: ( FlowCmdM env err m
209 -> Either CorpusName [CorpusId]
211 -> Maybe FlowSocialListWith
214 flow c u cn la mfslw docs = do
215 -- TODO if public insertMasterDocs else insertUserDocs
216 ids <- traverse (insertMasterDocs c la) docs
217 flowCorpusUser (la ^. tt_lang) u cn c (concat ids) mfslw
219 ------------------------------------------------------------------------
-- | Build the user-side corpus: create (or fetch) the user's root+corpus,
-- attach default child nodes (Texts, List, Dashboard, Graph), link the
-- document ids, build the ngrams lists against the master corpus and save
-- them. NOTE(review): parts of the signature and the final expression
-- (orig. 221-229, 231, and lines after 262) are missing from this chunk.
220 flowCorpusUser :: ( FlowCmdM env err m
225 -> Either CorpusName [CorpusId]
228 -> Maybe FlowSocialListWith
230 flowCorpusUser l user corpusName ctype ids mfslw = do
-- User's root node and corpus are created on first use.
232 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
233 -- NodeTexts is first
234 _tId <- insertDefaultNode NodeTexts userCorpusId userId
235 -- printDebug "NodeTexts: " tId
237 -- NodeList is second
238 listId <- getOrMkList userCorpusId userId
239 -- _cooc <- insertDefaultNode NodeListCooc listId userId
240 -- TODO: check if present already, ignore
-- Link the already-inserted documents to the user's corpus.
241 _ <- Doc.add userCorpusId ids
243 -- printDebug "Node Text Ids:" tId
-- Master corpus supplies the reference ngrams for list building.
246 (masterUserId, _masterRootId, masterCorpusId)
247 <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
249 --let gp = (GroupParams l 2 3 (StopSize 3))
-- Group terms by POS tag (CoreNLP) with no extra grouping map.
250 let gp = GroupWithPosTag l CoreNLP HashMap.empty
251 ngs <- buildNgramsLists user userCorpusId masterCorpusId mfslw gp
253 _userListId <- flowList_DbRepo listId ngs
254 _mastListId <- getOrMkList masterCorpusId masterUserId
255 -- _ <- insertOccsUpdates userCorpusId mastListId
256 -- printDebug "userListId" userListId
-- Default visualisation nodes for the corpus.
258 _ <- insertDefaultNode NodeDashboard userCorpusId userId
259 _ <- insertDefaultNode NodeGraph userCorpusId userId
260 --_ <- mkPhylo userCorpusId userId
262 -- _ <- mkAnnuaire rootUserId userId
-- | Insert documents into the master user's corpus, extract their ngrams
-- and persist them under the master list. Presumably returns the inserted
-- ids (the final line is not visible in this chunk — confirm in full file).
-- NOTE(review): the constraint list (orig. 267-273) and the binding line
-- for mapNgramsDocs' (orig. 285) are missing here.
266 insertMasterDocs :: ( FlowCmdM env err m
274 insertMasterDocs c lang hs = do
275 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
276 (ids', documentsWithId) <- insertDocs masterUserId masterCorpusId (map (toNode masterUserId masterCorpusId) hs )
277 _ <- Doc.add masterCorpusId ids'
279 -- create a corpus with database name (CSV or PubMed)
280 -- add documents to the corpus (create node_node link)
281 -- this will enable global database monitoring
283 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
284 mapNgramsDocs' :: HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
-- Extract ngrams per document, then aggregate to per-node counts.
286 <$> documentIdWithNgrams
287 (extractNgramsT $ withLang lang documentsWithId)
290 lId <- getOrMkList masterCorpusId masterUserId
291 _ <- saveDocNgramsWith lId mapNgramsDocs'
293 -- _cooc <- insertDefaultNode NodeListCooc lId masterUserId
-- | Persist an extracted-ngrams map under list @lId@: insert the raw
-- ngrams, register them as node-ngrams of the list, then write the
-- (node, cgram, weight) triples and the doc-ngrams index.
-- NOTE(review): some signature/body lines (orig. 297, 299, 303-304,
-- 306-307, 311-312, 320-321, and anything after 322) are missing here.
296 saveDocNgramsWith :: ( FlowCmdM env err m)
298 -> HashMap ExtractedNgrams (Map NgramsType (Map NodeId Int))
300 saveDocNgramsWith lId mapNgramsDocs' = do
-- Insert the extracted ngrams; terms2id maps each term to its db id.
301 terms2id <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'
302 let mapNgramsDocs = HashMap.mapKeys extracted2ngrams mapNgramsDocs'
305 let indexedNgrams = HashMap.mapKeys (indexNgrams terms2id) mapNgramsDocs
-- Register (term, ngrams-types) pairs as node-ngrams of the list.
308 mapCgramsId <- listInsertDb lId toNodeNgramsW'
309 $ map (first _ngramsTerms . second Map.keys)
310 $ HashMap.toList mapNgramsDocs
-- One NodeNodeNgrams2 row per (doc node, ngrams type, term) with its
-- weight; terms whose cgram id cannot be resolved are dropped (catMaybes).
313 _return <- insertNodeNodeNgrams2
314 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
315 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
316 <*> Just (fromIntegral w :: Double)
317 | (terms'', mapNgramsTypes) <- HashMap.toList mapNgramsDocs
318 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
319 , (nId, w) <- Map.toList mapNodeIdWeight
322 _ <- insertDocNgrams lId indexedNgrams
327 ------------------------------------------------------------------------
328 -- TODO Type NodeDocumentUnicised
-- | Insert documents under corpus @cId@ for user @uId@, deduplicating via
-- a uniq-id hash, and return the new ids paired with the indexed documents.
-- NOTE(review): constraint lines (orig. 330-335) and the @let@/@where@
-- introducing newIds'/documentsWithId (orig. 341) are missing here.
329 insertDocs :: ( FlowCmdM env err m
336 -> m ([DocId], [Indexed NodeId a])
337 insertDocs uId cId hs = do
-- Stamp each document with its uniq-id hash before insertion.
338 let docs = map addUniqId hs
339 newIds <- insertDb uId cId docs
340 -- printDebug "newIds" newIds
342 newIds' = map reId newIds
-- Join returned ids with the submitted docs via their uniq-id hash.
343 documentsWithId = mergeData (toInserted newIds) (Map.fromList $ map viewUniqId' docs)
344 _ <- Doc.add cId newIds'
345 pure (newIds', documentsWithId)
348 ------------------------------------------------------------------------
-- | Pair a document with its uniq-id hash; panics if the hash is absent
-- (it should always have been set by addUniqId beforehand).
-- NOTE(review): the rest of the signature and the @where@ keyword
-- (orig. 350-351, 353) are missing from this chunk.
349 viewUniqId' :: UniqId a
352 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
354 err = panic "[ERROR] Database.Flow.toInsert"
-- | Keep only the ReturnIds actually inserted and index them by their
-- uniq-id hash. NOTE(review): the rest of the signature and the equation
-- head (orig. 358-359) are missing from this chunk.
357 toInserted :: [ReturnId]
360 Map.fromList . map (\r -> (reUniqId r, r) )
361 . filter (\r -> reInserted r == True)
-- | Join inserted ReturnIds with the original documents by hash, producing
-- 'Indexed' documents; hashes absent from @rs@ are silently dropped.
-- NOTE(review): one signature line (orig. 364), the @where@ keyword
-- (orig. 367) and the end of toDocumentWithId (orig. 370+) are missing.
363 mergeData :: Map Hash ReturnId
365 -> [Indexed NodeId a]
366 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
368 toDocumentWithId (sha,hpd) =
369 Indexed <$> fmap reId (lookup sha rs)
372 ------------------------------------------------------------------------
373 ------------------------------------------------------------------------
374 ------------------------------------------------------------------------
-- | Run the ngrams-extraction function @f@ over every indexed document,
-- pairing each with its extracted ngrams.
-- NOTE(review): the first parameter's opening line (orig. 376), the
-- @where@ (orig. 381) and the call to @f@ (orig. 383) are missing here.
375 documentIdWithNgrams :: HasNodeError err
377 -> Cmd err (HashMap b (Map NgramsType Int)))
378 -> [Indexed NodeId a]
379 -> Cmd err [DocumentIdWithNgrams a b]
380 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
382 toDocumentIdWithNgrams d = do
384 pure $ DocumentIdWithNgrams d e
387 -- | TODO check optimization
-- Aggregate per-document ngrams into a single map keyed by ngram, with
-- nested per-type, per-node occurrence counts; overlapping keys are summed.
-- NOTE(review): the result-type lines of the signature (orig. 390-393) and
-- the @where@ (orig. 395, 399) are missing from this chunk.
388 mapNodeIdNgrams :: (Ord b, Hashable b)
389 => [DocumentIdWithNgrams a b]
394 mapNodeIdNgrams = HashMap.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
396 f :: DocumentIdWithNgrams a b
397 -> HashMap b (Map NgramsType (Map NodeId Int))
-- Tag every count with the document's node id.
398 f d = fmap (fmap (Map.singleton nId)) $ documentNgrams d
400 nId = _index $ documentWithId d
403 ------------------------------------------------------------------------
-- | Contacts only contribute their last name as an "Authors" ngram
-- (falling back to the literal "Nothing" when absent), truncated to 255.
-- NOTE(review): the @where@ and the @extract@ equation head
-- (orig. 405, 407, 410, 414) are missing from this chunk.
404 instance ExtractNgramsT HyperdataContact
406 extractNgramsT l hc = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extract l hc
408 extract :: TermType Lang -> HyperdataContact
409 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
411 let authors = map text2ngrams
412 $ maybe ["Nothing"] (\a -> [a])
413 $ view (hc_who . _Just . cw_lastName) hc'
415 pure $ HashMap.fromList $ [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
-- | Documents contribute Sources, Institutes, Authors (simple ngrams) and
-- the extracted, POS-enriched terms (NgramsTerms), each with count 1.
-- Missing fields fall back to the literal "Nothing"; keys are truncated
-- to 255 characters. NOTE(review): the @where@ lines and the field
-- accessors feeding @maybe@ (orig. 419, 421, 424, 426, 431-432, 435-436,
-- 439-440, 442, 444) are missing from this chunk.
418 instance ExtractNgramsT HyperdataDocument
420 extractNgramsT :: TermType Lang
422 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
423 extractNgramsT lang hd = HashMap.mapKeys (cleanExtractedNgrams 255) <$> extractNgramsT' lang hd
425 extractNgramsT' :: TermType Lang
427 -> Cmd err (HashMap ExtractedNgrams (Map NgramsType Int))
428 extractNgramsT' lang' doc = do
429 let source = text2ngrams
430 $ maybe "Nothing" identity
-- Institutes are split on ", " and normalised to school names.
433 institutes = map text2ngrams
434 $ maybe ["Nothing"] (map toSchoolName . (T.splitOn ", "))
437 authors = map text2ngrams
438 $ maybe ["Nothing"] (T.splitOn ", ")
-- Terms are extracted from the document text and enriched with POS tags
-- (CoreNLP, noun phrases).
441 terms' <- map (enrichedTerms (lang' ^. tt_lang) CoreNLP NP)
443 <$> liftBase (extractTerms lang' $ hasText doc)
445 pure $ HashMap.fromList
446 $ [(SimpleNgrams source, Map.singleton Sources 1) ]
447 <> [(SimpleNgrams i', Map.singleton Institutes 1) | i' <- institutes ]
448 <> [(SimpleNgrams a', Map.singleton Authors 1) | a' <- authors ]
449 <> [(EnrichedNgrams t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | A Node delegates ngrams extraction to its hyperdata payload (the 8th
-- field). NOTE(review): the @where@ line (orig. 452) is missing here.
451 instance (ExtractNgramsT a, HasText a) => ExtractNgramsT (Node a)
453 extractNgramsT l (Node _ _ _ _ _ _ _ h) = extractNgramsT l h
-- | A Node's text is the text of its hyperdata payload.
-- NOTE(review): the @where@ line (orig. 456) is missing here.
455 instance HasText a => HasText (Node a)
457 hasText (Node _ _ _ _ _ _ _ h) = hasText h
461 -- | TODO putelsewhere
462 -- | Upgrade function
463 -- Suppose all documents are English (this is the case actually)
-- Re-index every document under the master user's corpora, batching the
-- extraction by 1000 documents. NOTE(review): the trailing lines after
-- orig. 470 (presumably @pure ()@) are missing from this chunk.
464 indexAllDocumentsWithPosTag :: FlowCmdM env err m => m ()
465 indexAllDocumentsWithPosTag = do
466 rootId <- getRootId (UserName userMaster)
467 corpusIds <- findNodesId rootId [NodeCorpus]
468 docs <- List.concat <$> mapM getDocumentsWithParentId corpusIds
469 _ <- mapM extractInsert (splitEvery 1000 docs)
474 extractInsert :: FlowCmdM env err m => [Node HyperdataDocument] -> m ()
475 extractInsert docs = do
476 let documentsWithId = map (\doc -> Indexed (doc ^. node_id) doc) docs
478 mapNgramsDocs' <- mapNodeIdNgrams
479 <$> documentIdWithNgrams
480 (extractNgramsT $ withLang (Multi EN) documentsWithId)
483 _ <- insertExtractedNgrams $ HashMap.keys mapNgramsDocs'