]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
More refactoring
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ConstraintKinds #-}
13 {-# LANGUAGE DeriveGeneric #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE RankNTypes #-}
17 {-# LANGUAGE FlexibleContexts #-}
18
19 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
20 where
21
22 --import Debug.Trace (trace)
23 --import Control.Lens (view)
24 import Control.Monad (mapM_)
25 import Control.Monad.IO.Class (liftIO)
26 --import Gargantext.Core.Types
27 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
28 import Data.Map (Map, lookup, fromListWith, toList)
29 import Data.Maybe (Maybe(..), catMaybes)
30 import Data.Monoid
31 import Data.Text (Text, splitOn, intercalate)
32 import Data.Tuple.Extra (both)
33 import Data.List (concat)
34 import GHC.Show (Show)
35 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
36 import Gargantext.Core.Types.Individu (Username)
37 import Gargantext.Core.Types.Main
38 import Gargantext.Core (Lang(..))
39 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
40 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
41 import Gargantext.Database.Metrics.TFICF (getTficf)
42 import Gargantext.Text.Terms (extractTerms)
43 import Gargantext.Text.Metrics.TFICF (Tficf(..))
44 import Gargantext.Database.Node.Document.Add (add)
45 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
46 import Gargantext.Database.Root (getRoot)
47 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
48 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
49 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
50 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
51 import Gargantext.Database.Schema.User (getUser, UserLight(..))
52 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
53 import Gargantext.Database.Utils (Cmd, CmdM)
54 import Gargantext.Text.Terms (TermType(..))
55 import Gargantext.Ext.IMT (toSchoolName)
56 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
57 import Gargantext.Prelude
58 import Gargantext.Text.Parsers (parseDocs, FileFormat)
59 import System.FilePath (FilePath)
60 import Gargantext.API.Ngrams (HasRepoVar)
61 import Servant (ServantErr)
62 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
63 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
64 import qualified Data.Map as DM
65
66 type FlowCmdM env err m =
67 ( CmdM env err m
68 , RepoCmdM env err m
69 , HasNodeError err
70 , HasRepoVar env
71 )
72
73
74 flowCorpus :: FlowCmdM env ServantErr m
75 => FileFormat -> FilePath -> CorpusName -> m CorpusId
76 flowCorpus ff fp cName = do
77 --insertUsers [gargantuaUser, simpleUser]
78 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
79 params <- flowInsert NodeCorpus hyperdataDocuments' cName
80 flowCorpus' NodeCorpus hyperdataDocuments' params
81
82
83 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
84 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
85 flowInsert _nt hyperdataDocuments cName = do
86 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
87
88 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
89 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
90
91 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
92 _ <- add userCorpusId (map reId ids)
93
94 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
95
96
97 -- TODO-ACCESS:
98 -- check userId CanFillUserCorpus userCorpusId
99 -- check masterUserId CanFillMasterCorpus masterCorpusId
100 --
101 -- TODO-EVENTS:
102 -- InsertedNgrams ?
103 -- InsertedNodeNgrams ?
104 flowCorpus' :: FlowCmdM env err m
105 => NodeType -> [HyperdataDocument]
106 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
107 -> m CorpusId
108 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
109 --------------------------------------------------
110
111 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
112 --printDebug "documentsWithId" documentsWithId
113 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
114 --printDebug "docsWithNgrams" docsWithNgrams
115 let maps = mapNodeIdNgrams docsWithNgrams
116
117 --printDebug "maps" (maps)
118 terms2id <- insertNgrams $ DM.keys maps
119 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
120 --printDebug "inserted ngrams" indexedNgrams
121 _ <- insertToNodeNgrams indexedNgrams
122
123 -- List Ngrams Flow
124 _masterListId <- flowList masterUserId masterCorpusId indexedNgrams
125 _userListId <- flowListUser userId userCorpusId 500
126 --------------------------------------------------
127 _ <- mkDashboard userCorpusId userId
128 _ <- mkGraph userCorpusId userId
129
130 -- Annuaire Flow
131 -- _ <- mkAnnuaire rootUserId userId
132
133 pure userCorpusId
134 -- del [corpusId2, corpusId]
135
136 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
137 flowCorpus' _ _ _ = undefined
138
139
140 type CorpusName = Text
141
142 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
143 subFlowCorpus username cName = do
144 maybeUserId <- getUser username
145 userId <- case maybeUserId of
146 Nothing -> nodeError NoUserFound
147 -- mk NodeUser gargantua_id "Node Gargantua"
148 Just user -> pure $ userLight_id user
149
150 --printDebug "userId" userId
151 rootId' <- map _node_id <$> getRoot username
152
153 --printDebug "rootId'" rootId'
154 rootId'' <- case rootId' of
155 [] -> mkRoot username userId
156 n -> case length n >= 2 of
157 True -> nodeError ManyNodeUsers
158 False -> pure rootId'
159
160 --printDebug "rootId''" rootId''
161 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
162
163 corpusId'' <- if username == userMaster
164 then do
165 ns <- getCorporaWithParentId rootId
166 pure $ map _node_id ns
167 else
168 pure []
169
170 corpusId' <- if corpusId'' /= []
171 then pure corpusId''
172 else mkCorpus (Just cName) Nothing rootId userId
173
174 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
175
176 --printDebug "(username, userId, rootId, corpusId)"
177 -- (username, userId, rootId, corpusId)
178 pure (userId, rootId, corpusId)
179
180
181 ------------------------------------------------------------------------
182 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
183 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
184 where
185 err = "Database.Flow.toInsert"
186
187 toInserted :: [ReturnId] -> Map HashId ReturnId
188 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
189 . filter (\r -> reInserted r == True)
190
191 data DocumentWithId =
192 DocumentWithId { documentId :: !NodeId
193 , documentData :: !HyperdataDocument
194 } deriving (Show)
195
196 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
197 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
198 where
199 toDocumentWithId (hash,hpd) =
200 DocumentWithId <$> fmap reId (lookup hash rs)
201 <*> Just hpd
202
203 ------------------------------------------------------------------------
204 data DocumentIdWithNgrams =
205 DocumentIdWithNgrams
206 { documentWithId :: !DocumentWithId
207 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
208 } deriving (Show)
209
210 -- TODO group terms
211 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
212 extractNgramsT doc = do
213 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
214 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
215 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
216 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
217 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
218
219 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
220 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
221 <> [(a', DM.singleton Authors 1) | a' <- authors ]
222 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
223
224
225
226 documentIdWithNgrams :: HasNodeError err
227 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
228 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
229 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
230 where
231 toDocumentIdWithNgrams d = do
232 e <- f $ documentData d
233 pure $ DocumentIdWithNgrams d e
234
235 -- | TODO check optimization
236 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
237 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
238 where
239 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
240 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
241 where
242 nId = documentId $ documentWithId d
243
244 ------------------------------------------------------------------------
245 flowList :: FlowCmdM env err m => UserId -> CorpusId
246 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int))
247 -> m ListId
248 flowList uId cId ngs = do
249 --printDebug "ngs:" ngs
250 lId <- getOrMkList cId uId
251 printDebug "listId flowList" lId
252 --printDebug "ngs" (DM.keys ngs)
253 -- TODO add stemming equivalence of 2 ngrams
254 -- TODO needs rework
255 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
256 -- _ <- insertGroups lId groupEd
257
258 -- compute Candidate / Map
259 mapM_ (\(typeList, ngElmts) -> putListNgrams lId typeList ngElmts) $ toList $ ngrams2list' ngs
260
261 pure lId
262
263 flowListUser :: FlowCmdM env err m
264 => UserId -> CorpusId -> Int -> m ListId
265 flowListUser uId cId n = do
266 lId <- getOrMkList cId uId
267
268 ngs <- take n <$> sortWith tficf_score
269 <$> getTficf userMaster cId lId NgramsTerms
270
271 putListNgrams lId NgramsTerms $
272 [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
273 | ng <- ngs ]
274
275 pure lId
276
277 ------------------------------------------------------------------------
278
279 {-
280 TODO rework:
281 * quadratic
282 * DM.keys called twice
283 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
284 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
285 -> Map NgramsIndexed NgramsIndexed
286 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
287 -}
288
289
290 -- TODO check: do not insert duplicates
291 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
292 insertGroups lId ngrs =
293 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
294 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
295 , ng1 /= ng2
296 ]
297
298 ------------------------------------------------------------------------
299 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
300 -> [(ListType, (NgramsType, NgramsIndexed))]
301 ngrams2list m =
302 [ (CandidateList, (t, ng))
303 | (ng, tm) <- DM.toList m
304 , t <- DM.keys tm
305 ]
306
307 ngrams2list' :: Map NgramsIndexed (Map NgramsType a)
308 -> Map NgramsType [NgramsElement]
309 ngrams2list' m = fromListWith (<>)
310 [ (t, [NgramsElement (_ngramsTerms $ _ngrams ng) CandidateList 1 Nothing mempty])
311 | (ng, tm) <- DM.toList m
312 , t <- DM.keys tm
313 ]
314
315
316
317
318
319 -- | TODO: weight of the list could be a probability
320 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
321 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
322 | (l,(ngt, ng)) <- lngs
323 ]
324 ------------------------------------------------------------------------
325
326
327 -- | Annuaire
328
329 flowAnnuaire :: FlowCmdM env ServantErr m => FilePath -> m ()
330 flowAnnuaire filePath = do
331 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
332 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
333 printDebug "length annuaire" ps
334
335
336 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
337 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
338 flowInsertAnnuaire name children = do
339
340 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
341 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
342
343 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
344 _ <- add userCorpusId (map reId ids)
345
346 printDebug "AnnuaireID" userCorpusId
347
348 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
349
350
351 subFlowAnnuaire :: HasNodeError err =>
352 Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
353 subFlowAnnuaire username _cName = do
354 maybeUserId <- getUser username
355
356 userId <- case maybeUserId of
357 Nothing -> nodeError NoUserFound
358 -- mk NodeUser gargantua_id "Node Gargantua"
359 Just user -> pure $ userLight_id user
360
361 rootId' <- map _node_id <$> getRoot username
362
363 rootId'' <- case rootId' of
364 [] -> mkRoot username userId
365 n -> case length n >= 2 of
366 True -> nodeError ManyNodeUsers
367 False -> pure rootId'
368 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
369
370 corpusId' <- mkAnnuaire rootId userId
371
372 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
373
374 printDebug "(username, userId, rootId, corpusId)"
375 (username, userId, rootId, corpusId)
376 pure (userId, rootId, corpusId)
377
378