]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[FIX] HyperdataList Arbitrary needs a fix.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ConstraintKinds #-}
13 {-# LANGUAGE DeriveGeneric #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE RankNTypes #-}
17 {-# LANGUAGE FlexibleContexts #-}
18
19 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
20 where
21
22 --import Control.Lens (view)
23 import Control.Monad.IO.Class (liftIO)
24 --import Gargantext.Core.Types
25 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
26 import Data.Map (Map, lookup)
27 import Data.Maybe (Maybe(..), catMaybes)
28 import Data.Monoid
29 import Data.Text (Text, splitOn, intercalate)
30 import Data.Tuple.Extra (both)
31 import Data.List (concat)
32 import GHC.Show (Show)
33 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
34 import Gargantext.Core.Types.Individu (Username)
35 import Gargantext.Core.Types.Main
36 import Gargantext.Core (Lang(..))
37 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
38 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
39 import Gargantext.Database.Metrics.TFICF (getTficf)
40 import Gargantext.Text.Terms (extractTerms)
41 import Gargantext.Text.Metrics.TFICF (Tficf(..))
42 import Gargantext.Database.Node.Document.Add (add)
43 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
44 import Gargantext.Database.Root (getRoot)
45 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
46 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
47 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
48 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
49 import Gargantext.Database.Schema.User (getUser, UserLight(..))
50 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
51 import Gargantext.Database.Utils (Cmd, CmdM)
52 import Gargantext.Text.Terms (TermType(..))
53 import Gargantext.Ext.IMT (toSchoolName)
54 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
55 import Gargantext.Prelude
56 import Gargantext.Text.Parsers (parseDocs, FileFormat)
57 import System.FilePath (FilePath)
58 import Gargantext.API.Ngrams (HasRepoVar)
59 import Servant (ServantErr)
60 import Gargantext.API.Ngrams (NgramsElement(..), insertNewListOfNgramsElements, RepoCmdM)
61 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
62 import qualified Data.Map as DM
63
64 type FlowCmdM env err m =
65 ( CmdM env err m
66 , RepoCmdM env err m
67 , HasNodeError err
68 , HasRepoVar env
69 )
70
71
72 flowCorpus :: FlowCmdM env ServantErr m
73 => FileFormat -> FilePath -> CorpusName -> m CorpusId
74 flowCorpus ff fp cName = do
75 --insertUsers [gargantuaUser, simpleUser]
76 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
77 params <- flowInsert NodeCorpus hyperdataDocuments' cName
78 flowCorpus' NodeCorpus hyperdataDocuments' params
79
80
81 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
82 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
83 flowInsert _nt hyperdataDocuments cName = do
84 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
85
86 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
87 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
88
89 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
90 _ <- add userCorpusId (map reId ids)
91
92 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
93
94
95 flowAnnuaire :: FlowCmdM env ServantErr m => FilePath -> m ()
96 flowAnnuaire filePath = do
97 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
98 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
99 printDebug "length annuaire" ps
100
101
102 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
103 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
104 flowInsertAnnuaire name children = do
105
106 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
107 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
108
109 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
110 _ <- add userCorpusId (map reId ids)
111
112 printDebug "AnnuaireID" userCorpusId
113
114 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
115
116 -- TODO-ACCESS:
117 -- check userId CanFillUserCorpus userCorpusId
118 -- check masterUserId CanFillMasterCorpus masterCorpusId
119 --
120 -- TODO-EVENTS:
121 -- InsertedNgrams ?
122 -- InsertedNodeNgrams ?
123 flowCorpus' :: FlowCmdM env err m
124 => NodeType -> [HyperdataDocument]
125 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
126 -> m CorpusId
127 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
128 --------------------------------------------------
129 -- List Ngrams Flow
130 userListId <- flowListUser userId userCorpusId 500
131 printDebug "Working on User ListId : " userListId
132
133 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
134 printDebug "documentsWithId" documentsWithId
135 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
136 printDebug "docsWithNgrams" docsWithNgrams
137 let maps = mapNodeIdNgrams docsWithNgrams
138
139 -- printDebug "maps" (maps)
140 terms2id <- insertNgrams $ DM.keys maps
141 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
142 printDebug "inserted ngrams" indexedNgrams
143 _ <- insertToNodeNgrams indexedNgrams
144
145 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
146 printDebug "Working on ListId : " listId2
147 --}
148 --------------------------------------------------
149 _ <- mkDashboard userCorpusId userId
150 _ <- mkGraph userCorpusId userId
151
152 -- Annuaire Flow
153 -- _ <- mkAnnuaire rootUserId userId
154
155 pure userCorpusId
156 -- del [corpusId2, corpusId]
157
158 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
159 flowCorpus' _ _ _ = undefined
160
161
162 type CorpusName = Text
163
164 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
165 subFlowCorpus username cName = do
166 maybeUserId <- getUser username
167 userId <- case maybeUserId of
168 Nothing -> nodeError NoUserFound
169 -- mk NodeUser gargantua_id "Node Gargantua"
170 Just user -> pure $ userLight_id user
171
172 printDebug "userId" userId
173 rootId' <- map _node_id <$> getRoot username
174
175 printDebug "rootId'" rootId'
176 rootId'' <- case rootId' of
177 [] -> mkRoot username userId
178 n -> case length n >= 2 of
179 True -> nodeError ManyNodeUsers
180 False -> pure rootId'
181
182 printDebug "rootId''" rootId''
183 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
184
185 corpusId'' <- if username == userMaster
186 then do
187 ns <- getCorporaWithParentId rootId
188 pure $ map _node_id ns
189 else
190 pure []
191
192 corpusId' <- if corpusId'' /= []
193 then pure corpusId''
194 else mkCorpus (Just cName) Nothing rootId userId
195
196 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
197
198 printDebug "(username, userId, rootId, corpusId)"
199 (username, userId, rootId, corpusId)
200 pure (userId, rootId, corpusId)
201
202
203 subFlowAnnuaire :: HasNodeError err =>
204 Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
205 subFlowAnnuaire username _cName = do
206 maybeUserId <- getUser username
207
208 userId <- case maybeUserId of
209 Nothing -> nodeError NoUserFound
210 -- mk NodeUser gargantua_id "Node Gargantua"
211 Just user -> pure $ userLight_id user
212
213 rootId' <- map _node_id <$> getRoot username
214
215 rootId'' <- case rootId' of
216 [] -> mkRoot username userId
217 n -> case length n >= 2 of
218 True -> nodeError ManyNodeUsers
219 False -> pure rootId'
220 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
221
222 corpusId' <- mkAnnuaire rootId userId
223
224 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
225
226 printDebug "(username, userId, rootId, corpusId)"
227 (username, userId, rootId, corpusId)
228 pure (userId, rootId, corpusId)
229
230 ------------------------------------------------------------------------
231 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
232 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
233 where
234 err = "Database.Flow.toInsert"
235
236 toInserted :: [ReturnId] -> Map HashId ReturnId
237 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
238 . filter (\r -> reInserted r == True)
239
240 data DocumentWithId =
241 DocumentWithId { documentId :: !NodeId
242 , documentData :: !HyperdataDocument
243 } deriving (Show)
244
245 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
246 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
247 where
248 toDocumentWithId (hash,hpd) =
249 DocumentWithId <$> fmap reId (lookup hash rs)
250 <*> Just hpd
251
252 ------------------------------------------------------------------------
253 data DocumentIdWithNgrams =
254 DocumentIdWithNgrams
255 { documentWithId :: !DocumentWithId
256 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
257 } deriving (Show)
258
259 -- TODO group terms
260 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
261 extractNgramsT doc = do
262 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
263 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
264 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
265 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
266 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
267
268 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
269 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
270 <> [(a', DM.singleton Authors 1) | a' <- authors ]
271 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
272
273
274
275 documentIdWithNgrams :: HasNodeError err
276 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
277 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
278 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
279 where
280 toDocumentIdWithNgrams d = do
281 e <- f $ documentData d
282 pure $ DocumentIdWithNgrams d e
283
284 -- | TODO check optimization
285 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
286 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
287 where
288 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
289 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
290 where
291 nId = documentId $ documentWithId d
292
293 ------------------------------------------------------------------------
294 flowList :: HasNodeError err => UserId -> CorpusId
295 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
296 flowList uId cId ngs = do
297 -- printDebug "ngs:" ngs
298 lId <- getOrMkList cId uId
299 printDebug "ngs" (DM.keys ngs)
300 -- TODO add stemming equivalence of 2 ngrams
301 -- TODO needs rework
302 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
303 -- _ <- insertGroups lId groupEd
304
305 -- compute Candidate / Map
306 is <- insertLists lId $ ngrams2list ngs
307 printDebug "listNgrams inserted :" is
308
309 pure lId
310
311 flowListUser :: FlowCmdM env err m
312 => UserId -> CorpusId -> Int -> m ListId
313 flowListUser uId cId n = do
314 lId <- getOrMkList cId uId
315 -- is <- insertLists lId $ ngrams2list ngs
316
317 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
318 -- _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
319
320 insertNewListOfNgramsElements lId NgramsTerms $
321 [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
322 | ng <- ngs ]
323
324 pure lId
325
326 ------------------------------------------------------------------------
327
328 {-
329 TODO rework:
330 * quadratic
331 * DM.keys called twice
332 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
333 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
334 -> Map NgramsIndexed NgramsIndexed
335 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
336 -}
337
338
339 -- TODO check: do not insert duplicates
340 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
341 insertGroups lId ngrs =
342 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
343 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
344 , ng1 /= ng2
345 ]
346
347 ------------------------------------------------------------------------
348 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
349 -> [(ListType, (NgramsType,NgramsIndexed))]
350 ngrams2list m =
351 [ (CandidateList, (t, ng))
352 | (ng, tm) <- DM.toList m
353 , t <- DM.keys tm
354 ]
355
356 -- | TODO: weight of the list could be a probability
357 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
358 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
359 | (l,(ngt, ng)) <- lngs
360 ]
361 ------------------------------------------------------------------------