2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29 {-# LANGUAGE TemplateHaskell #-}
31 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
41 , getOrMk_RootWithCorpus
50 import Control.Lens ((^.), view, _Just, makeLenses)
51 import Data.Aeson.TH (deriveJSON)
53 import Data.List (concat)
54 import Data.Map (Map, lookup)
55 import Data.Maybe (Maybe(..), catMaybes)
58 import Data.Text (splitOn, intercalate)
59 import Data.Traversable (traverse)
60 import Data.Tuple.Extra (first, second)
61 import Gargantext.Core (Lang(..))
62 import Gargantext.Core.Flow.Types
63 import Gargantext.Core.Types (Terms(..))
64 import Gargantext.Core.Types.Individu (User(..))
65 import Gargantext.Core.Types.Main
66 import Gargantext.Database.Action.Flow.List
67 import Gargantext.Database.Action.Flow.Types
68 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
69 import Gargantext.Database.Query.Table.Node
70 import Gargantext.Database.Query.Table.Node.Contact -- (HyperdataContact(..), ContactWho(..))
71 import Gargantext.Database.Query.Table.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
72 import Gargantext.Database.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
73 import Gargantext.Database.Action.Search (searchInDatabase)
74 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
75 import Gargantext.Database.Query.Table.Node.Error (HasNodeError(..))
76 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
77 import Gargantext.Database.Prelude
78 import Gargantext.Database.Query.Table.Ngrams
79 import Gargantext.Database.Query.Table.NodeNgrams (listInsertDb , getCgramsId)
80 import Gargantext.Database.Query.Table.NodeNodeNgrams2
81 import Gargantext.Ext.IMT (toSchoolName)
82 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
83 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
84 import Gargantext.Text
85 import Gargantext.Prelude
86 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
87 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
88 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
89 import Gargantext.Text.Terms
90 import GHC.Generics (Generic)
91 import System.FilePath (FilePath)
92 import qualified Data.Map as Map
93 import qualified Gargantext.Database.Query.Table.Node.Document.Add as Doc (add)
94 import qualified Gargantext.Text.Corpus.API as API
96 ------------------------------------------------------------------------
97 -- TODO use internal with API name (could be old data)
98 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
99 | ExternalOrigin { _do_api :: API.ExternalAPIs }
101 deriving (Generic, Eq)
103 makeLenses ''DataOrigin
104 deriveJSON (unPrefix "_do_") ''DataOrigin
105 instance ToSchema DataOrigin where
106 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
108 allDataOrigins :: [DataOrigin]
109 allDataOrigins = map InternalOrigin API.externalAPIs
110 <> map ExternalOrigin API.externalAPIs
114 data DataText = DataOld ![NodeId]
115 | DataNew ![[HyperdataDocument]]
118 -- TODO use the split parameter in config file
119 getDataText :: FlowCmdM env err m
125 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
127 <$> API.get api (_tt_lang la) q li
128 getDataText (InternalOrigin _) _la q _li = do
129 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
130 (UserName userMaster)
132 (Nothing :: Maybe HyperdataCorpus)
133 ids <- map fst <$> searchInDatabase cId (stemIt q)
136 -------------------------------------------------------------------------------
137 flowDataText :: FlowCmdM env err m
143 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
145 corpusType = (Nothing :: Maybe HyperdataCorpus)
146 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
148 ------------------------------------------------------------------------
150 flowAnnuaire :: FlowCmdM env err m
152 -> Either CorpusName [CorpusId]
156 flowAnnuaire u n l filePath = do
157 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
158 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
160 ------------------------------------------------------------------------
161 flowCorpusFile :: FlowCmdM env err m
163 -> Either CorpusName [CorpusId]
164 -> Limit -- Limit the number of docs (for dev purpose)
165 -> TermType Lang -> FileFormat -> FilePath
167 flowCorpusFile u n l la ff fp = do
168 docs <- liftBase ( splitEvery 500
172 flowCorpus u n la (map (map toHyperdataDocument) docs)
174 ------------------------------------------------------------------------
175 -- | TODO improve the needed type to create/update a corpus
176 -- (For now, Either is enough)
177 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
179 -> Either CorpusName [CorpusId]
183 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
186 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
189 -> Either CorpusName [CorpusId]
193 flow c u cn la docs = do
194 ids <- traverse (insertMasterDocs c la) docs
195 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
197 ------------------------------------------------------------------------
198 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
201 -> Either CorpusName [CorpusId]
205 flowCorpusUser l user corpusName ctype ids = do
207 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
208 listId <- getOrMkList userCorpusId userId
209 _cooc <- mkNode NodeListCooc listId userId
210 -- TODO: check if present already, ignore
211 _ <- Doc.add userCorpusId ids
213 _tId <- mkNode NodeTexts userCorpusId userId
214 -- printDebug "Node Text Id" tId
217 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
218 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
219 _userListId <- flowList_DbRepo listId ngs
220 _mastListId <- getOrMkList masterCorpusId masterUserId
221 -- _ <- insertOccsUpdates userCorpusId mastListId
222 -- printDebug "userListId" userListId
224 _ <- mkDashboard userCorpusId userId
225 _ <- mkGraph userCorpusId userId
226 --_ <- mkPhylo userCorpusId userId
229 -- _ <- mkAnnuaire rootUserId userId
233 insertMasterDocs :: ( FlowCmdM env err m
241 insertMasterDocs c lang hs = do
242 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
244 -- TODO Type NodeDocumentUnicised
245 let docs = map addUniqId hs
246 ids <- insertDb masterUserId masterCorpusId docs
249 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
251 -- create a corpus with database name (CSV or PubMed)
252 -- add documents to the corpus (create node_node link)
253 -- this will enable global database monitoring
255 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
256 maps <- mapNodeIdNgrams
257 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
259 terms2id <- insertNgrams $ Map.keys maps
261 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
264 lId <- getOrMkList masterCorpusId masterUserId
265 mapCgramsId <- listInsertDb lId toNodeNgramsW'
266 $ map (first _ngramsTerms . second Map.keys)
269 _return <- insertNodeNodeNgrams2
270 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
271 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms'')
272 <*> Just (fromIntegral w :: Double)
273 | (terms'', mapNgramsTypes) <- Map.toList maps
274 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
275 , (nId, w) <- Map.toList mapNodeIdWeight
278 _ <- Doc.add masterCorpusId ids'
279 _cooc <- mkNode NodeListCooc lId masterUserId
281 _ <- insertDocNgrams lId indexedNgrams
286 ------------------------------------------------------------------------
290 ------------------------------------------------------------------------
291 viewUniqId' :: UniqId a
294 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
296 err = panic "[ERROR] Database.Flow.toInsert"
299 toInserted :: [ReturnId]
300 -> Map HashId ReturnId
302 Map.fromList . map (\r -> (reUniqId r, r) )
303 . filter (\r -> reInserted r == True)
305 mergeData :: Map HashId ReturnId
307 -> [DocumentWithId a]
308 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
310 toDocumentWithId (sha,hpd) =
311 DocumentWithId <$> fmap reId (lookup sha rs)
314 ------------------------------------------------------------------------
316 instance HasText HyperdataContact
320 ------------------------------------------------------------------------
321 ------------------------------------------------------------------------
323 documentIdWithNgrams :: HasNodeError err
325 -> Cmd err (Map Ngrams (Map NgramsType Int)))
326 -> [DocumentWithId a]
327 -> Cmd err [DocumentIdWithNgrams a]
328 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
330 toDocumentIdWithNgrams d = do
331 e <- f $ documentData d
332 pure $ DocumentIdWithNgrams d e
335 ------------------------------------------------------------------------
338 instance ExtractNgramsT HyperdataContact
340 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
342 extract :: TermType Lang -> HyperdataContact
343 -> Cmd err (Map Ngrams (Map NgramsType Int))
345 let authors = map text2ngrams
346 $ maybe ["Nothing"] (\a -> [a])
347 $ view (hc_who . _Just . cw_lastName) hc'
349 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
351 instance HasText HyperdataDocument
353 hasText h = catMaybes [ _hyperdataDocument_title h
354 , _hyperdataDocument_abstract h
357 instance ExtractNgramsT HyperdataDocument
359 extractNgramsT :: TermType Lang
361 -> Cmd err (Map Ngrams (Map NgramsType Int))
362 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
364 extractNgramsT' :: TermType Lang
366 -> Cmd err (Map Ngrams (Map NgramsType Int))
367 extractNgramsT' lang' doc = do
368 let source = text2ngrams
369 $ maybe "Nothing" identity
370 $ _hyperdataDocument_source doc
372 institutes = map text2ngrams
373 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
374 $ _hyperdataDocument_institutes doc
376 authors = map text2ngrams
377 $ maybe ["Nothing"] (splitOn ", ")
378 $ _hyperdataDocument_authors doc
380 terms' <- map text2ngrams
381 <$> map (intercalate " " . _terms_label)
383 <$> liftBase (extractTerms lang' $ hasText doc)
385 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
386 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
387 <> [(a', Map.singleton Authors 1) | a' <- authors ]
388 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]