2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ConstraintKinds #-}
13 {-# LANGUAGE DeriveGeneric #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE RankNTypes #-}
18 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
21 --import Control.Lens (view)
22 import Control.Monad.IO.Class (liftIO)
23 --import Gargantext.Core.Types
24 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
25 import Data.Map (Map, lookup)
26 import Data.Maybe (Maybe(..), catMaybes)
28 import Data.Text (Text, splitOn, intercalate)
29 import Data.Tuple.Extra (both)
30 import Data.List (concat)
31 import GHC.Show (Show)
32 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
33 import Gargantext.Core.Types.Individu (Username)
34 import Gargantext.Core.Types.Main
35 import Gargantext.Core (Lang(..))
36 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
37 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
38 import Gargantext.Database.Metrics.TFICF (getTficf)
39 import Gargantext.Text.Terms (extractTerms)
40 import Gargantext.Text.Metrics.TFICF (Tficf(..))
41 import Gargantext.Database.Node.Document.Add (add)
42 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
43 import Gargantext.Database.Root (getRoot)
44 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
45 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
46 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
47 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
48 import Gargantext.Database.Schema.User (getUser, UserLight(..))
49 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
50 import Gargantext.Database.Utils (Cmd, CmdM)
51 import Gargantext.Text.Terms (TermType(..))
52 import Gargantext.Ext.IMT (toSchoolName)
53 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
54 import Gargantext.Prelude
55 import Gargantext.Text.Parsers (parseDocs, FileFormat)
56 import System.FilePath (FilePath)
58 import Gargantext.API.Ngrams (NgramsElement(..), insertNewListOfNgramsElements, RepoCmdM)
60 import qualified Data.Map as DM
62 type FlowCmdM env err m =
68 flowCorpus :: FlowCmdM env err m => FileFormat -> FilePath -> CorpusName -> m CorpusId
69 flowCorpus ff fp cName = do
70 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
71 params <- flowInsert NodeCorpus hyperdataDocuments' cName
72 flowCorpus' NodeCorpus hyperdataDocuments' params
75 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
76 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
77 flowInsert _nt hyperdataDocuments cName = do
78 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
80 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
81 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
83 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
84 _ <- add userCorpusId (map reId ids)
86 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
89 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
90 flowAnnuaire filePath = do
91 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
92 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
93 printDebug "length annuaire" ps
96 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
97 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
98 flowInsertAnnuaire name children = do
100 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
101 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
103 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
104 _ <- add userCorpusId (map reId ids)
106 --printDebug "AnnuaireID" userCorpusId
108 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
111 -- check userId CanFillUserCorpus userCorpusId
112 -- check masterUserId CanFillMasterCorpus masterCorpusId
116 -- InsertedNodeNgrams ?
117 flowCorpus' :: FlowCmdM env err m
118 => NodeType -> [HyperdataDocument]
119 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
121 flowCorpus' NodeCorpus hyperdataDocuments (ids,_masterUserId,_masterCorpusId, userId,userCorpusId) = do
122 --------------------------------------------------
124 _userListId <- flowListUser userId userCorpusId 500
125 --printDebug "Working on User ListId : " userListId
127 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
128 -- printDebug "documentsWithId" documentsWithId
129 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
130 -- printDebug "docsWithNgrams" docsWithNgrams
131 let maps = mapNodeIdNgrams docsWithNgrams
133 -- printDebug "maps" (maps)
134 terms2id <- insertNgrams $ DM.keys maps
135 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
136 -- printDebug "inserted ngrams" indexedNgrams
137 _ <- insertToNodeNgrams indexedNgrams
139 --listId2 <- flowList masterUserId masterCorpusId indexedNgrams
140 --printDebug "Working on ListId : " listId2
142 --------------------------------------------------
143 _ <- mkDashboard userCorpusId userId
144 _ <- mkGraph userCorpusId userId
147 -- _ <- mkAnnuaire rootUserId userId
150 -- del [corpusId2, corpusId]
152 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
153 flowCorpus' _ _ _ = undefined
156 type CorpusName = Text
158 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
159 subFlowCorpus username cName = do
160 maybeUserId <- getUser username
162 userId <- case maybeUserId of
163 Nothing -> nodeError NoUserFound
164 -- mk NodeUser gargantua_id "Node Gargantua"
165 Just user -> pure $ userLight_id user
167 rootId' <- map _node_id <$> getRoot username
169 rootId'' <- case rootId' of
170 [] -> mkRoot username userId
171 n -> case length n >= 2 of
172 True -> nodeError ManyNodeUsers
173 False -> pure rootId'
174 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
176 corpusId'' <- if username == userMaster
178 ns <- getCorporaWithParentId rootId
179 pure $ map _node_id ns
183 corpusId' <- if corpusId'' /= []
185 else mkCorpus (Just cName) Nothing rootId userId
187 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
189 --printDebug "(username, userId, rootId, corpusId)"
190 -- (username, userId, rootId, corpusId)
191 pure (userId, rootId, corpusId)
194 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
195 subFlowAnnuaire username _cName = do
196 maybeUserId <- getUser username
198 userId <- case maybeUserId of
199 Nothing -> nodeError NoUserFound
200 -- mk NodeUser gargantua_id "Node Gargantua"
201 Just user -> pure $ userLight_id user
203 rootId' <- map _node_id <$> getRoot username
205 rootId'' <- case rootId' of
206 [] -> mkRoot username userId
207 n -> case length n >= 2 of
208 True -> nodeError ManyNodeUsers
209 False -> pure rootId'
210 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
212 corpusId' <- mkAnnuaire rootId userId
214 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
216 --printDebug "(username, userId, rootId, corpusId)"
217 -- (username, userId, rootId, corpusId)
218 pure (userId, rootId, corpusId)
220 ------------------------------------------------------------------------
221 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
222 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
224 err = "Database.Flow.toInsert"
226 toInserted :: [ReturnId] -> Map HashId ReturnId
227 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
228 . filter (\r -> reInserted r == True)
230 data DocumentWithId =
231 DocumentWithId { documentId :: !NodeId
232 , documentData :: !HyperdataDocument
235 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
236 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
238 toDocumentWithId (hash,hpd) =
239 DocumentWithId <$> fmap reId (lookup hash rs)
242 ------------------------------------------------------------------------
243 data DocumentIdWithNgrams =
245 { documentWithId :: !DocumentWithId
246 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
250 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
251 extractNgramsT doc = do
252 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
253 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
254 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
255 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
256 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
258 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
259 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
260 <> [(a', DM.singleton Authors 1) | a' <- authors ]
261 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
265 documentIdWithNgrams :: HasNodeError err
266 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
267 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
268 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
270 toDocumentIdWithNgrams d = do
271 e <- f $ documentData d
272 pure $ DocumentIdWithNgrams d e
274 -- | TODO check optimization
275 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
276 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
278 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
279 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
281 nId = documentId $ documentWithId d
283 ------------------------------------------------------------------------
284 flowList :: HasNodeError err => UserId -> CorpusId
285 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
286 flowList uId cId _ngs = do
287 -- printDebug "ngs:" ngs
288 lId <- getOrMkList cId uId
289 --printDebug "ngs" (DM.keys ngs)
290 -- TODO add stemming equivalence of 2 ngrams
292 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
293 -- _ <- insertGroups lId groupEd
295 -- compute Candidate / Map
296 --is <- insertLists lId $ ngrams2list ngs
297 --printDebug "listNgrams inserted :" is
301 flowListUser :: FlowCmdM env err m
302 => UserId -> CorpusId -> Int -> m NodeId
303 flowListUser uId cId n = do
304 lId <- getOrMkList cId uId
305 -- is <- insertLists lId $ ngrams2list ngs
307 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
308 -- _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
310 insertNewListOfNgramsElements lId NgramsTerms $
311 [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
316 ------------------------------------------------------------------------
321 * DM.keys called twice
322 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
323 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
324 -> Map NgramsIndexed NgramsIndexed
325 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
329 -- TODO check: do not insert duplicates
330 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
331 insertGroups lId ngrs =
332 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
333 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
337 ------------------------------------------------------------------------
338 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
339 -> [(ListType, (NgramsType,NgramsIndexed))]
341 [ (CandidateList, (t, ng))
342 | (ng, tm) <- DM.toList m
346 -- | TODO: weight of the list could be a probability
347 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
348 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
349 | (l,(ngt, ng)) <- lngs
351 ------------------------------------------------------------------------