]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[FLOW] FIX listUser, but still repo.json is empty
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ConstraintKinds #-}
13 {-# LANGUAGE DeriveGeneric #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE RankNTypes #-}
17 {-# LANGUAGE FlexibleContexts #-}
18
19 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
20 where
21
22 --import Control.Lens (view)
23 import Control.Monad (mapM_)
24 import Control.Monad.IO.Class (liftIO)
25 --import Gargantext.Core.Types
26 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
27 import Data.Map (Map, lookup, fromListWith, toList)
28 import Data.Maybe (Maybe(..), catMaybes)
29 import Data.Monoid
30 import Data.Text (Text, splitOn, intercalate)
31 import Data.Tuple.Extra (both)
32 import Data.List (concat)
33 import GHC.Show (Show)
34 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
35 import Gargantext.Core.Types.Individu (Username)
36 import Gargantext.Core.Types.Main
37 import Gargantext.Core (Lang(..))
38 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
39 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
40 import Gargantext.Database.Metrics.TFICF (getTficf)
41 import Gargantext.Text.Terms (extractTerms)
42 import Gargantext.Text.Metrics.TFICF (Tficf(..))
43 import Gargantext.Database.Node.Document.Add (add)
44 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
45 import Gargantext.Database.Root (getRoot)
46 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
47 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
48 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
49 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
50 import Gargantext.Database.Schema.User (getUser, UserLight(..))
51 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
52 import Gargantext.Database.Utils (Cmd, CmdM)
53 import Gargantext.Text.Terms (TermType(..))
54 import Gargantext.Ext.IMT (toSchoolName)
55 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
56 import Gargantext.Prelude
57 import Gargantext.Text.Parsers (parseDocs, FileFormat)
58 import System.FilePath (FilePath)
59 import Gargantext.API.Ngrams (HasRepoVar)
60 import Servant (ServantErr)
61 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
62 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
63 import qualified Data.Map as DM
64
65 type FlowCmdM env err m =
66 ( CmdM env err m
67 , RepoCmdM env err m
68 , HasNodeError err
69 , HasRepoVar env
70 )
71
72
73 flowCorpus :: FlowCmdM env ServantErr m
74 => FileFormat -> FilePath -> CorpusName -> m CorpusId
75 flowCorpus ff fp cName = do
76 --insertUsers [gargantuaUser, simpleUser]
77 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
78 params <- flowInsert NodeCorpus hyperdataDocuments' cName
79 flowCorpus' NodeCorpus hyperdataDocuments' params
80
81
82 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
83 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
84 flowInsert _nt hyperdataDocuments cName = do
85 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
86
87 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
88 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
89
90 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
91 _ <- add userCorpusId (map reId ids)
92
93 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
94
95
96 -- TODO-ACCESS:
97 -- check userId CanFillUserCorpus userCorpusId
98 -- check masterUserId CanFillMasterCorpus masterCorpusId
99 --
100 -- TODO-EVENTS:
101 -- InsertedNgrams ?
102 -- InsertedNodeNgrams ?
103 flowCorpus' :: FlowCmdM env err m
104 => NodeType -> [HyperdataDocument]
105 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
106 -> m CorpusId
107 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
108 --------------------------------------------------
109
110 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
111 --printDebug "documentsWithId" documentsWithId
112 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
113 --printDebug "docsWithNgrams" docsWithNgrams
114 let maps = mapNodeIdNgrams docsWithNgrams
115
116 --printDebug "maps" (maps)
117 terms2id <- insertNgrams $ DM.keys maps
118 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
119 --printDebug "inserted ngrams" indexedNgrams
120 _ <- insertToNodeNgrams indexedNgrams
121
122 -- List Ngrams Flow
123 _masterListId <- flowList masterUserId masterCorpusId indexedNgrams
124 _userListId <- flowListUser userId userCorpusId 500
125 --printDebug "Working on User ListId : " userListId
126 --}
127 --------------------------------------------------
128 _ <- mkDashboard userCorpusId userId
129 _ <- mkGraph userCorpusId userId
130
131 -- Annuaire Flow
132 -- _ <- mkAnnuaire rootUserId userId
133
134 pure userCorpusId
135 -- del [corpusId2, corpusId]
136
137 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
138 flowCorpus' _ _ _ = undefined
139
140
141 type CorpusName = Text
142
143 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
144 subFlowCorpus username cName = do
145 maybeUserId <- getUser username
146 userId <- case maybeUserId of
147 Nothing -> nodeError NoUserFound
148 -- mk NodeUser gargantua_id "Node Gargantua"
149 Just user -> pure $ userLight_id user
150
151 --printDebug "userId" userId
152 rootId' <- map _node_id <$> getRoot username
153
154 --printDebug "rootId'" rootId'
155 rootId'' <- case rootId' of
156 [] -> mkRoot username userId
157 n -> case length n >= 2 of
158 True -> nodeError ManyNodeUsers
159 False -> pure rootId'
160
161 --printDebug "rootId''" rootId''
162 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
163
164 corpusId'' <- if username == userMaster
165 then do
166 ns <- getCorporaWithParentId rootId
167 pure $ map _node_id ns
168 else
169 pure []
170
171 corpusId' <- if corpusId'' /= []
172 then pure corpusId''
173 else mkCorpus (Just cName) Nothing rootId userId
174
175 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
176
177 --printDebug "(username, userId, rootId, corpusId)"
178 -- (username, userId, rootId, corpusId)
179 pure (userId, rootId, corpusId)
180
181
182 ------------------------------------------------------------------------
183 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
184 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
185 where
186 err = "Database.Flow.toInsert"
187
188 toInserted :: [ReturnId] -> Map HashId ReturnId
189 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
190 . filter (\r -> reInserted r == True)
191
192 data DocumentWithId =
193 DocumentWithId { documentId :: !NodeId
194 , documentData :: !HyperdataDocument
195 } deriving (Show)
196
197 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
198 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
199 where
200 toDocumentWithId (hash,hpd) =
201 DocumentWithId <$> fmap reId (lookup hash rs)
202 <*> Just hpd
203
204 ------------------------------------------------------------------------
205 data DocumentIdWithNgrams =
206 DocumentIdWithNgrams
207 { documentWithId :: !DocumentWithId
208 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
209 } deriving (Show)
210
211 -- TODO group terms
212 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
213 extractNgramsT doc = do
214 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
215 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
216 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
217 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
218 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
219
220 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
221 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
222 <> [(a', DM.singleton Authors 1) | a' <- authors ]
223 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
224
225
226
227 documentIdWithNgrams :: HasNodeError err
228 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
229 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
230 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
231 where
232 toDocumentIdWithNgrams d = do
233 e <- f $ documentData d
234 pure $ DocumentIdWithNgrams d e
235
236 -- | TODO check optimization
237 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
238 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
239 where
240 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
241 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
242 where
243 nId = documentId $ documentWithId d
244
245 ------------------------------------------------------------------------
246 flowList :: FlowCmdM env err m => UserId -> CorpusId
247 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int))
248 -> m ListId
249 flowList uId cId ngs = do
250 --printDebug "ngs:" ngs
251 lId <- getOrMkList cId uId
252 --printDebug "ngs" (DM.keys ngs)
253 -- TODO add stemming equivalence of 2 ngrams
254 -- TODO needs rework
255 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
256 -- _ <- insertGroups lId groupEd
257
258 -- compute Candidate / Map
259 _is <- mapM_ (\(typeList, ngElements) -> putListNgrams lId typeList ngElements) $ toList $ ngrams2list' ngs
260 --printDebug "listNgrams inserted :" is
261
262 pure lId
263
264 flowListUser :: FlowCmdM env err m
265 => UserId -> CorpusId -> Int -> m ListId
266 flowListUser uId cId n = do
267 lId <- getOrMkList cId uId
268
269 ngs <- take n <$> sortWith tficf_score
270 <$> getTficf userMaster cId lId NgramsTerms
271
272 putListNgrams lId NgramsTerms $
273 [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
274 | ng <- ngs ]
275
276 pure lId
277
278 ------------------------------------------------------------------------
279
280 {-
281 TODO rework:
282 * quadratic
283 * DM.keys called twice
284 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
285 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
286 -> Map NgramsIndexed NgramsIndexed
287 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
288 -}
289
290
291 -- TODO check: do not insert duplicates
292 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
293 insertGroups lId ngrs =
294 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
295 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
296 , ng1 /= ng2
297 ]
298
299 ------------------------------------------------------------------------
300 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
301 -> [(ListType, (NgramsType, NgramsIndexed))]
302 ngrams2list m =
303 [ (CandidateList, (t, ng))
304 | (ng, tm) <- DM.toList m
305 , t <- DM.keys tm
306 ]
307
308 ngrams2list' :: Map NgramsIndexed (Map NgramsType a)
309 -> Map NgramsType [NgramsElement]
310 ngrams2list' m = fromListWith (<>)
311 [ (t, [NgramsElement (_ngramsTerms $ _ngrams ng) CandidateList 1 Nothing mempty])
312 | (ng, tm) <- DM.toList m
313 , t <- DM.keys tm
314 ]
315
316
317
318
319
320 -- | TODO: weight of the list could be a probability
321 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
322 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
323 | (l,(ngt, ng)) <- lngs
324 ]
325 ------------------------------------------------------------------------
326
327
328 -- | Annuaire
329
330 flowAnnuaire :: FlowCmdM env ServantErr m => FilePath -> m ()
331 flowAnnuaire filePath = do
332 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
333 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
334 printDebug "length annuaire" ps
335
336
337 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
338 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
339 flowInsertAnnuaire name children = do
340
341 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
342 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
343
344 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
345 _ <- add userCorpusId (map reId ids)
346
347 printDebug "AnnuaireID" userCorpusId
348
349 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
350
351
352 subFlowAnnuaire :: HasNodeError err =>
353 Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
354 subFlowAnnuaire username _cName = do
355 maybeUserId <- getUser username
356
357 userId <- case maybeUserId of
358 Nothing -> nodeError NoUserFound
359 -- mk NodeUser gargantua_id "Node Gargantua"
360 Just user -> pure $ userLight_id user
361
362 rootId' <- map _node_id <$> getRoot username
363
364 rootId'' <- case rootId' of
365 [] -> mkRoot username userId
366 n -> case length n >= 2 of
367 True -> nodeError ManyNodeUsers
368 False -> pure rootId'
369 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
370
371 corpusId' <- mkAnnuaire rootId userId
372
373 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
374
375 printDebug "(username, userId, rootId, corpusId)"
376 (username, userId, rootId, corpusId)
377 pure (userId, rootId, corpusId)
378
379