2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ConstraintKinds #-}
13 {-# LANGUAGE DeriveGeneric #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE RankNTypes #-}
17 {-# LANGUAGE FlexibleContexts #-}
19 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
22 --import Control.Lens (view)
23 import Control.Monad (mapM_)
24 import Control.Monad.IO.Class (liftIO)
25 --import Gargantext.Core.Types
26 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
27 import Data.Map (Map, lookup, fromListWith, toList)
28 import Data.Maybe (Maybe(..), catMaybes)
30 import Data.Text (Text, splitOn, intercalate)
31 import Data.Tuple.Extra (both)
32 import Data.List (concat)
33 import GHC.Show (Show)
34 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
35 import Gargantext.Core.Types.Individu (Username)
36 import Gargantext.Core.Types.Main
37 import Gargantext.Core (Lang(..))
38 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
39 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
40 import Gargantext.Database.Metrics.TFICF (getTficf)
41 import Gargantext.Text.Terms (extractTerms)
42 import Gargantext.Text.Metrics.TFICF (Tficf(..))
43 import Gargantext.Database.Node.Document.Add (add)
44 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
45 import Gargantext.Database.Root (getRoot)
46 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
47 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
48 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
49 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
50 import Gargantext.Database.Schema.User (getUser, UserLight(..))
51 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
52 import Gargantext.Database.Utils (Cmd, CmdM)
53 import Gargantext.Text.Terms (TermType(..))
54 import Gargantext.Ext.IMT (toSchoolName)
55 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
56 import Gargantext.Prelude
57 import Gargantext.Text.Parsers (parseDocs, FileFormat)
58 import System.FilePath (FilePath)
59 import Gargantext.API.Ngrams (HasRepoVar)
60 import Servant (ServantErr)
61 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
62 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
63 import qualified Data.Map as DM
65 type FlowCmdM env err m =
73 flowCorpus :: FlowCmdM env ServantErr m
74 => FileFormat -> FilePath -> CorpusName -> m CorpusId
75 flowCorpus ff fp cName = do
76 --insertUsers [gargantuaUser, simpleUser]
77 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
78 params <- flowInsert NodeCorpus hyperdataDocuments' cName
79 flowCorpus' NodeCorpus hyperdataDocuments' params
82 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
83 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
84 flowInsert _nt hyperdataDocuments cName = do
85 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
87 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
88 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
90 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
91 _ <- add userCorpusId (map reId ids)
93 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
97 -- check userId CanFillUserCorpus userCorpusId
98 -- check masterUserId CanFillMasterCorpus masterCorpusId
102 -- InsertedNodeNgrams ?
103 flowCorpus' :: FlowCmdM env err m
104 => NodeType -> [HyperdataDocument]
105 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
107 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
108 --------------------------------------------------
110 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
111 --printDebug "documentsWithId" documentsWithId
112 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
113 --printDebug "docsWithNgrams" docsWithNgrams
114 let maps = mapNodeIdNgrams docsWithNgrams
116 --printDebug "maps" (maps)
117 terms2id <- insertNgrams $ DM.keys maps
118 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
119 --printDebug "inserted ngrams" indexedNgrams
120 _ <- insertToNodeNgrams indexedNgrams
123 _masterListId <- flowList masterUserId masterCorpusId indexedNgrams
124 _userListId <- flowListUser userId userCorpusId 500
125 --printDebug "Working on User ListId : " userListId
127 --------------------------------------------------
128 _ <- mkDashboard userCorpusId userId
129 _ <- mkGraph userCorpusId userId
132 -- _ <- mkAnnuaire rootUserId userId
135 -- del [corpusId2, corpusId]
137 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
138 flowCorpus' _ _ _ = undefined
141 type CorpusName = Text
143 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
144 subFlowCorpus username cName = do
145 maybeUserId <- getUser username
146 userId <- case maybeUserId of
147 Nothing -> nodeError NoUserFound
148 -- mk NodeUser gargantua_id "Node Gargantua"
149 Just user -> pure $ userLight_id user
151 --printDebug "userId" userId
152 rootId' <- map _node_id <$> getRoot username
154 --printDebug "rootId'" rootId'
155 rootId'' <- case rootId' of
156 [] -> mkRoot username userId
157 n -> case length n >= 2 of
158 True -> nodeError ManyNodeUsers
159 False -> pure rootId'
161 --printDebug "rootId''" rootId''
162 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
164 corpusId'' <- if username == userMaster
166 ns <- getCorporaWithParentId rootId
167 pure $ map _node_id ns
171 corpusId' <- if corpusId'' /= []
173 else mkCorpus (Just cName) Nothing rootId userId
175 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
177 --printDebug "(username, userId, rootId, corpusId)"
178 -- (username, userId, rootId, corpusId)
179 pure (userId, rootId, corpusId)
182 ------------------------------------------------------------------------
183 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
184 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
186 err = "Database.Flow.toInsert"
188 toInserted :: [ReturnId] -> Map HashId ReturnId
189 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
190 . filter (\r -> reInserted r == True)
192 data DocumentWithId =
193 DocumentWithId { documentId :: !NodeId
194 , documentData :: !HyperdataDocument
197 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
198 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
200 toDocumentWithId (hash,hpd) =
201 DocumentWithId <$> fmap reId (lookup hash rs)
204 ------------------------------------------------------------------------
205 data DocumentIdWithNgrams =
207 { documentWithId :: !DocumentWithId
208 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
212 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
213 extractNgramsT doc = do
214 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
215 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
216 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
217 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
218 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
220 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
221 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
222 <> [(a', DM.singleton Authors 1) | a' <- authors ]
223 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
227 documentIdWithNgrams :: HasNodeError err
228 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
229 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
230 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
232 toDocumentIdWithNgrams d = do
233 e <- f $ documentData d
234 pure $ DocumentIdWithNgrams d e
236 -- | TODO check optimization
237 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
238 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
240 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
241 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
243 nId = documentId $ documentWithId d
245 ------------------------------------------------------------------------
246 flowList :: FlowCmdM env err m => UserId -> CorpusId
247 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int))
249 flowList uId cId ngs = do
250 --printDebug "ngs:" ngs
251 lId <- getOrMkList cId uId
252 --printDebug "ngs" (DM.keys ngs)
253 -- TODO add stemming equivalence of 2 ngrams
255 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
256 -- _ <- insertGroups lId groupEd
258 -- compute Candidate / Map
259 _is <- mapM_ (\(typeList, ngElements) -> putListNgrams lId typeList ngElements) $ toList $ ngrams2list' ngs
260 --printDebug "listNgrams inserted :" is
264 flowListUser :: FlowCmdM env err m
265 => UserId -> CorpusId -> Int -> m ListId
266 flowListUser uId cId n = do
267 lId <- getOrMkList cId uId
269 ngs <- take n <$> sortWith tficf_score
270 <$> getTficf userMaster cId lId NgramsTerms
272 putListNgrams lId NgramsTerms $
273 [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
278 ------------------------------------------------------------------------
283 * DM.keys called twice
284 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
285 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
286 -> Map NgramsIndexed NgramsIndexed
287 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
291 -- TODO check: do not insert duplicates
292 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
293 insertGroups lId ngrs =
294 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
295 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
299 ------------------------------------------------------------------------
300 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
301 -> [(ListType, (NgramsType, NgramsIndexed))]
303 [ (CandidateList, (t, ng))
304 | (ng, tm) <- DM.toList m
308 ngrams2list' :: Map NgramsIndexed (Map NgramsType a)
309 -> Map NgramsType [NgramsElement]
310 ngrams2list' m = fromListWith (<>)
311 [ (t, [NgramsElement (_ngramsTerms $ _ngrams ng) CandidateList 1 Nothing mempty])
312 | (ng, tm) <- DM.toList m
320 -- | TODO: weight of the list could be a probability
321 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
322 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
323 | (l,(ngt, ng)) <- lngs
325 ------------------------------------------------------------------------
330 flowAnnuaire :: FlowCmdM env ServantErr m => FilePath -> m ()
331 flowAnnuaire filePath = do
332 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
333 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
334 printDebug "length annuaire" ps
337 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
338 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
339 flowInsertAnnuaire name children = do
341 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
342 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
344 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
345 _ <- add userCorpusId (map reId ids)
347 printDebug "AnnuaireID" userCorpusId
349 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
352 subFlowAnnuaire :: HasNodeError err =>
353 Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
354 subFlowAnnuaire username _cName = do
355 maybeUserId <- getUser username
357 userId <- case maybeUserId of
358 Nothing -> nodeError NoUserFound
359 -- mk NodeUser gargantua_id "Node Gargantua"
360 Just user -> pure $ userLight_id user
362 rootId' <- map _node_id <$> getRoot username
364 rootId'' <- case rootId' of
365 [] -> mkRoot username userId
366 n -> case length n >= 2 of
367 True -> nodeError ManyNodeUsers
368 False -> pure rootId'
369 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
371 corpusId' <- mkAnnuaire rootId userId
373 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
375 printDebug "(username, userId, rootId, corpusId)"
376 (username, userId, rootId, corpusId)
377 pure (userId, rootId, corpusId)