2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ConstraintKinds #-}
13 {-# LANGUAGE DeriveGeneric #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE RankNTypes #-}
17 {-# LANGUAGE FlexibleContexts #-}
19 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
22 --import Control.Lens (view)
23 import Control.Monad.IO.Class (liftIO)
24 --import Gargantext.Core.Types
25 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
26 import Data.Map (Map, lookup)
27 import Data.Maybe (Maybe(..), catMaybes)
29 import Data.Text (Text, splitOn, intercalate)
30 import Data.Tuple.Extra (both)
31 import Data.List (concat)
32 import GHC.Show (Show)
33 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
34 import Gargantext.Core.Types.Individu (Username)
35 import Gargantext.Core.Types.Main
36 import Gargantext.Core (Lang(..))
37 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
38 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
39 import Gargantext.Database.Metrics.TFICF (getTficf)
40 import Gargantext.Text.Terms (extractTerms)
41 import Gargantext.Text.Metrics.TFICF (Tficf(..))
42 import Gargantext.Database.Node.Document.Add (add)
43 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
44 import Gargantext.Database.Root (getRoot)
45 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
46 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
47 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
48 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
49 import Gargantext.Database.Schema.User (getUser, UserLight(..))
50 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
51 import Gargantext.Database.Utils (Cmd, CmdM)
52 import Gargantext.Text.Terms (TermType(..))
53 import Gargantext.Ext.IMT (toSchoolName)
54 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
55 import Gargantext.Prelude
56 import Gargantext.Text.Parsers (parseDocs, FileFormat)
57 import System.FilePath (FilePath)
58 import Gargantext.API.Ngrams (HasRepoVar)
59 import Servant (ServantErr)
60 import Gargantext.API.Ngrams (NgramsElement(..), insertNewListOfNgramsElements, RepoCmdM)
61 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
62 import qualified Data.Map as DM
64 type FlowCmdM env err m =
72 flowCorpus :: FlowCmdM env ServantErr m
73 => FileFormat -> FilePath -> CorpusName -> m CorpusId
74 flowCorpus ff fp cName = do
75 --insertUsers [gargantuaUser, simpleUser]
76 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
77 params <- flowInsert NodeCorpus hyperdataDocuments' cName
78 flowCorpus' NodeCorpus hyperdataDocuments' params
81 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
82 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
83 flowInsert _nt hyperdataDocuments cName = do
84 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
86 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
87 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
89 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
90 _ <- add userCorpusId (map reId ids)
92 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
95 flowAnnuaire :: FlowCmdM env ServantErr m => FilePath -> m ()
96 flowAnnuaire filePath = do
97 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
98 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
99 printDebug "length annuaire" ps
102 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
103 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
104 flowInsertAnnuaire name children = do
106 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
107 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
109 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
110 _ <- add userCorpusId (map reId ids)
112 printDebug "AnnuaireID" userCorpusId
114 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
117 -- check userId CanFillUserCorpus userCorpusId
118 -- check masterUserId CanFillMasterCorpus masterCorpusId
122 -- InsertedNodeNgrams ?
123 flowCorpus' :: FlowCmdM env err m
124 => NodeType -> [HyperdataDocument]
125 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
127 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
128 --------------------------------------------------
130 userListId <- flowListUser userId userCorpusId 500
131 printDebug "Working on User ListId : " userListId
133 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
134 printDebug "documentsWithId" documentsWithId
135 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
136 printDebug "docsWithNgrams" docsWithNgrams
137 let maps = mapNodeIdNgrams docsWithNgrams
139 -- printDebug "maps" (maps)
140 terms2id <- insertNgrams $ DM.keys maps
141 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
142 printDebug "inserted ngrams" indexedNgrams
143 _ <- insertToNodeNgrams indexedNgrams
145 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
146 printDebug "Working on ListId : " listId2
148 --------------------------------------------------
149 _ <- mkDashboard userCorpusId userId
150 _ <- mkGraph userCorpusId userId
153 -- _ <- mkAnnuaire rootUserId userId
156 -- del [corpusId2, corpusId]
158 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
159 flowCorpus' _ _ _ = undefined
162 type CorpusName = Text
164 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
165 subFlowCorpus username cName = do
166 maybeUserId <- getUser username
167 userId <- case maybeUserId of
168 Nothing -> nodeError NoUserFound
169 -- mk NodeUser gargantua_id "Node Gargantua"
170 Just user -> pure $ userLight_id user
172 printDebug "userId" userId
173 rootId' <- map _node_id <$> getRoot username
175 printDebug "rootId'" rootId'
176 rootId'' <- case rootId' of
177 [] -> mkRoot username userId
178 n -> case length n >= 2 of
179 True -> nodeError ManyNodeUsers
180 False -> pure rootId'
182 printDebug "rootId''" rootId''
183 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
185 corpusId'' <- if username == userMaster
187 ns <- getCorporaWithParentId rootId
188 pure $ map _node_id ns
192 corpusId' <- if corpusId'' /= []
194 else mkCorpus (Just cName) Nothing rootId userId
196 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
198 printDebug "(username, userId, rootId, corpusId)"
199 (username, userId, rootId, corpusId)
200 pure (userId, rootId, corpusId)
203 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
204 subFlowAnnuaire username _cName = do
205 maybeUserId <- getUser username
207 userId <- case maybeUserId of
208 Nothing -> nodeError NoUserFound
209 -- mk NodeUser gargantua_id "Node Gargantua"
210 Just user -> pure $ userLight_id user
212 rootId' <- map _node_id <$> getRoot username
214 rootId'' <- case rootId' of
215 [] -> mkRoot username userId
216 n -> case length n >= 2 of
217 True -> nodeError ManyNodeUsers
218 False -> pure rootId'
219 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
221 corpusId' <- mkAnnuaire rootId userId
223 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
225 printDebug "(username, userId, rootId, corpusId)"
226 (username, userId, rootId, corpusId)
227 pure (userId, rootId, corpusId)
229 ------------------------------------------------------------------------
230 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
231 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
233 err = "Database.Flow.toInsert"
235 toInserted :: [ReturnId] -> Map HashId ReturnId
236 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
237 . filter (\r -> reInserted r == True)
239 data DocumentWithId =
240 DocumentWithId { documentId :: !NodeId
241 , documentData :: !HyperdataDocument
244 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
245 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
247 toDocumentWithId (hash,hpd) =
248 DocumentWithId <$> fmap reId (lookup hash rs)
251 ------------------------------------------------------------------------
252 data DocumentIdWithNgrams =
254 { documentWithId :: !DocumentWithId
255 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
259 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
260 extractNgramsT doc = do
261 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
262 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
263 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
264 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
265 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
267 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
268 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
269 <> [(a', DM.singleton Authors 1) | a' <- authors ]
270 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
274 documentIdWithNgrams :: HasNodeError err
275 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
276 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
277 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
279 toDocumentIdWithNgrams d = do
280 e <- f $ documentData d
281 pure $ DocumentIdWithNgrams d e
283 -- | TODO check optimization
284 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
285 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
287 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
288 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
290 nId = documentId $ documentWithId d
292 ------------------------------------------------------------------------
293 flowList :: HasNodeError err => UserId -> CorpusId
294 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
295 flowList uId cId ngs = do
296 -- printDebug "ngs:" ngs
297 lId <- getOrMkList cId uId
298 printDebug "ngs" (DM.keys ngs)
299 -- TODO add stemming equivalence of 2 ngrams
301 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
302 -- _ <- insertGroups lId groupEd
304 -- compute Candidate / Map
305 is <- insertLists lId $ ngrams2list ngs
306 printDebug "listNgrams inserted :" is
310 flowListUser :: FlowCmdM env err m
311 => UserId -> CorpusId -> Int -> m ListId
312 flowListUser uId cId n = do
313 lId <- getOrMkList cId uId
314 -- is <- insertLists lId $ ngrams2list ngs
316 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
317 -- _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
319 insertNewListOfNgramsElements lId NgramsTerms $
320 [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
325 ------------------------------------------------------------------------
330 * DM.keys called twice
331 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
332 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
333 -> Map NgramsIndexed NgramsIndexed
334 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
338 -- TODO check: do not insert duplicates
339 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
340 insertGroups lId ngrs =
341 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
342 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
346 ------------------------------------------------------------------------
347 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
348 -> [(ListType, (NgramsType,NgramsIndexed))]
350 [ (CandidateList, (t, ng))
351 | (ng, tm) <- DM.toList m
355 -- | TODO: weight of the list could be a probability
356 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
357 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
358 | (l,(ngt, ng)) <- lngs
360 ------------------------------------------------------------------------