]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[READING] proposal: keeping data consistency with database.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
16
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
18 where
19
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
26 import Data.Monoid
27 import Data.Text (Text, splitOn, intercalate)
28 import Data.Tuple.Extra (both)
29 import Data.List (concat)
30 import GHC.Show (Show)
31 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
32 import Gargantext.Core.Types.Individu (Username)
33 import Gargantext.Core.Types.Main
34 import Gargantext.Core (Lang(..))
35 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
36 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
37 import Gargantext.Database.Metrics.TFICF (getTficf)
38 import Gargantext.Text.Terms (extractTerms)
39 import Gargantext.Text.Metrics.TFICF (Tficf(..))
40 import Gargantext.Database.Node.Document.Add (add)
41 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
42 import Gargantext.Database.Root (getRoot)
43 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
44 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
45 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
46 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
47 import Gargantext.Database.Schema.User (getUser, UserLight(..))
48 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
49 import Gargantext.Database.Utils (Cmd)
50 import Gargantext.Text.Terms (TermType(..))
51 import Gargantext.Ext.IMT (toSchoolName)
52 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
53 import Gargantext.Prelude
54 import Gargantext.Text.Parsers (parseDocs, FileFormat)
55 import System.FilePath (FilePath)
56
57 import Gargantext.API.Ngrams (NgramsElement(..), insertNewListOfNgramsElements, RepoCmdM)
58
59 import qualified Data.Map as DM
60
61
62 flowCorpus :: RepoCmdM env err m => FileFormat -> FilePath -> CorpusName -> m CorpusId
63 flowCorpus ff fp cName = do
64 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
65 params <- flowInsert NodeCorpus hyperdataDocuments' cName
66 flowCorpus' NodeCorpus hyperdataDocuments' params
67
68
69 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
70 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
71 flowInsert _nt hyperdataDocuments cName = do
72 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
73
74 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
75 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
76
77 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
78 _ <- add userCorpusId (map reId ids)
79
80 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
81
82
83 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
84 flowAnnuaire filePath = do
85 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
86 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
87 printDebug "length annuaire" ps
88
89
90 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
91 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
92 flowInsertAnnuaire name children = do
93
94 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
95 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
96
97 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
98 _ <- add userCorpusId (map reId ids)
99
100 --printDebug "AnnuaireID" userCorpusId
101
102 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
103
104 -- TODO-ACCESS:
105 -- check userId CanFillUserCorpus userCorpusId
106 -- check masterUserId CanFillMasterCorpus masterCorpusId
107 --
108 -- TODO-EVENTS:
109 -- InsertedNgrams ?
110 -- InsertedNodeNgrams ?
111 flowCorpus' :: RepoCmdM env err m
112 => NodeType -> [HyperdataDocument]
113 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
114 -> m CorpusId
115 flowCorpus' NodeCorpus hyperdataDocuments (ids,_masterUserId,_masterCorpusId, userId,userCorpusId) = do
116 --------------------------------------------------
117 -- List Ngrams Flow
118 _userListId <- flowListUser userId userCorpusId 500
119 --printDebug "Working on User ListId : " userListId
120
121 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
122 -- printDebug "documentsWithId" documentsWithId
123 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
124 -- printDebug "docsWithNgrams" docsWithNgrams
125 let maps = mapNodeIdNgrams docsWithNgrams
126
127 -- printDebug "maps" (maps)
128 terms2id <- insertNgrams $ DM.keys maps
129 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
130 -- printDebug "inserted ngrams" indexedNgrams
131 _ <- insertToNodeNgrams indexedNgrams
132
133 --listId2 <- flowList masterUserId masterCorpusId indexedNgrams
134 --printDebug "Working on ListId : " listId2
135 --}
136 --------------------------------------------------
137 _ <- mkDashboard userCorpusId userId
138 _ <- mkGraph userCorpusId userId
139
140 -- Annuaire Flow
141 -- _ <- mkAnnuaire rootUserId userId
142
143 pure userCorpusId
144 -- del [corpusId2, corpusId]
145
146 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
147 flowCorpus' _ _ _ = undefined
148
149
150 type CorpusName = Text
151
152 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
153 subFlowCorpus username cName = do
154 maybeUserId <- getUser username
155
156 userId <- case maybeUserId of
157 Nothing -> nodeError NoUserFound
158 -- mk NodeUser gargantua_id "Node Gargantua"
159 Just user -> pure $ userLight_id user
160
161 rootId' <- map _node_id <$> getRoot username
162
163 rootId'' <- case rootId' of
164 [] -> mkRoot username userId
165 n -> case length n >= 2 of
166 True -> nodeError ManyNodeUsers
167 False -> pure rootId'
168 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
169
170 corpusId'' <- if username == userMaster
171 then do
172 ns <- getCorporaWithParentId rootId
173 pure $ map _node_id ns
174 else
175 pure []
176
177 corpusId' <- if corpusId'' /= []
178 then pure corpusId''
179 else mkCorpus (Just cName) Nothing rootId userId
180
181 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
182
183 --printDebug "(username, userId, rootId, corpusId)"
184 -- (username, userId, rootId, corpusId)
185 pure (userId, rootId, corpusId)
186
187
188 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
189 subFlowAnnuaire username _cName = do
190 maybeUserId <- getUser username
191
192 userId <- case maybeUserId of
193 Nothing -> nodeError NoUserFound
194 -- mk NodeUser gargantua_id "Node Gargantua"
195 Just user -> pure $ userLight_id user
196
197 rootId' <- map _node_id <$> getRoot username
198
199 rootId'' <- case rootId' of
200 [] -> mkRoot username userId
201 n -> case length n >= 2 of
202 True -> nodeError ManyNodeUsers
203 False -> pure rootId'
204 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
205
206 corpusId' <- mkAnnuaire rootId userId
207
208 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
209
210 --printDebug "(username, userId, rootId, corpusId)"
211 -- (username, userId, rootId, corpusId)
212 pure (userId, rootId, corpusId)
213
214 ------------------------------------------------------------------------
215 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
216 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
217 where
218 err = "Database.Flow.toInsert"
219
220 toInserted :: [ReturnId] -> Map HashId ReturnId
221 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
222 . filter (\r -> reInserted r == True)
223
224 data DocumentWithId =
225 DocumentWithId { documentId :: !NodeId
226 , documentData :: !HyperdataDocument
227 } deriving (Show)
228
229 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
230 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
231 where
232 toDocumentWithId (hash,hpd) =
233 DocumentWithId <$> fmap reId (lookup hash rs)
234 <*> Just hpd
235
236 ------------------------------------------------------------------------
237 data DocumentIdWithNgrams =
238 DocumentIdWithNgrams
239 { documentWithId :: !DocumentWithId
240 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
241 } deriving (Show)
242
243 -- TODO group terms
244 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
245 extractNgramsT doc = do
246 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
247 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
248 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
249 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
250 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
251
252 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
253 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
254 <> [(a', DM.singleton Authors 1) | a' <- authors ]
255 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
256
257
258
259 documentIdWithNgrams :: HasNodeError err
260 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
261 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
262 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
263 where
264 toDocumentIdWithNgrams d = do
265 e <- f $ documentData d
266 pure $ DocumentIdWithNgrams d e
267
268 -- | TODO check optimization
269 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
270 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
271 where
272 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
273 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
274 where
275 nId = documentId $ documentWithId d
276
277 ------------------------------------------------------------------------
278 flowList :: HasNodeError err => UserId -> CorpusId
279 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
280 flowList uId cId _ngs = do
281 -- printDebug "ngs:" ngs
282 lId <- getOrMkList cId uId
283 --printDebug "ngs" (DM.keys ngs)
284 -- TODO add stemming equivalence of 2 ngrams
285 -- TODO needs rework
286 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
287 -- _ <- insertGroups lId groupEd
288
289 -- compute Candidate / Map
290 --is <- insertLists lId $ ngrams2list ngs
291 --printDebug "listNgrams inserted :" is
292
293 pure lId
294
295 flowListUser :: RepoCmdM env err m
296 => UserId -> CorpusId -> Int -> m ListId
297 flowListUser uId cId n = do
298 lId <- getOrMkList cId uId
299 -- is <- insertLists lId $ ngrams2list ngs
300
301 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
302 -- _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
303
304 insertNewListOfNgramsElements lId $
305 DM.singleton NgramsTerms
306 [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
307 | ng <- ngs ]
308
309 pure lId
310
311 ------------------------------------------------------------------------
312
313 {-
314 TODO rework:
315 * quadratic
316 * DM.keys called twice
317 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
318 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
319 -> Map NgramsIndexed NgramsIndexed
320 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
321 -}
322
323
324 -- TODO check: do not insert duplicates
325 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
326 insertGroups lId ngrs =
327 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
328 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
329 , ng1 /= ng2
330 ]
331
332 ------------------------------------------------------------------------
333 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
334 -> [(ListType, (NgramsType,NgramsIndexed))]
335 ngrams2list m =
336 [ (CandidateList, (t, ng))
337 | (ng, tm) <- DM.toList m
338 , t <- DM.keys tm
339 ]
340
341 -- | TODO: weight of the list could be a probability
342 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
343 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
344 | (l,(ngt, ng)) <- lngs
345 ]
346 ------------------------------------------------------------------------
347