{-|
Module      : Gargantext.Database.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}

module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
  where

--import Control.Lens (view)
import Control.Monad.IO.Class (liftIO)
--import Gargantext.Core.Types
--import Gargantext.Database.Node.Contact (HyperdataContact(..))
import Data.Map (Map, lookup)
import Data.Maybe (Maybe(..), catMaybes)
import Data.Monoid
import Data.Text (Text, splitOn, intercalate)
import Data.Tuple.Extra (both)
import Data.List (concat)
import GHC.Show (Show)
import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
import Gargantext.Core.Types.Individu (Username)
import Gargantext.Core.Types.Main
import Gargantext.Core (Lang(..))
import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
import Gargantext.Database.Metrics.TFICF (getTficf)
import Gargantext.Text.Terms (extractTerms)
import Gargantext.Text.Metrics.TFICF (Tficf(..))
import Gargantext.Database.Node.Document.Add (add)
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Root (getRoot)
import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
import Gargantext.Database.Schema.User (getUser, UserLight(..))
import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Utils (Cmd, CmdM)
import Gargantext.Text.Terms (TermType(..))
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude
import Gargantext.Text.Parsers (parseDocs, FileFormat)
import System.FilePath (FilePath)

import Gargantext.API.Ngrams (NgramsElement(..), insertNewListOfNgramsElements, RepoCmdM)

import qualified Data.Map as DM

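-- | Constraints shared by the flow functions: database commands ('CmdM'),
-- access to the ngrams repository ('RepoCmdM') and node-level error handling.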
type FlowCmdM env err m =
  ( CmdM     env err m
  , RepoCmdM env err m
  , HasNodeError err
  )

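-- | Full corpus flow: parse the given file, insert the documents in both the
-- master corpus and a user corpus named 'cName', then run the ngrams flow.
-- Returns the user 'CorpusId'.
--
-- A minimal usage sketch (the 'WOS' format constructor and the file path are
-- illustrative assumptions, not taken from this module):
--
-- > corpusId <- flowCorpus WOS "/tmp/corpus.wos" "My corpus"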
flowCorpus :: FlowCmdM env err m => FileFormat -> FilePath -> CorpusName -> m CorpusId
flowCorpus ff fp cName = do
  hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
  params <- flowInsert NodeCorpus hyperdataDocuments' cName
  flowCorpus' NodeCorpus hyperdataDocuments' params

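-- | Insert the documents under the master corpus (owned by the master user)
-- and attach them to a user corpus named 'cName', returning all the ids
-- needed by the rest of the flow.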
flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
           -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
flowInsert _nt hyperdataDocuments cName = do
  let hyperdataDocuments' = map ToDbDocument hyperdataDocuments

  (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
  ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'

  (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
  _ <- add userCorpusId (map reId ids)

  pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)

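-- | Deserialise IMT contacts from a file and insert them as an annuaire.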
flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
flowAnnuaire filePath = do
  contacts <- liftIO $ deserialiseImtUsersFromFile filePath
  ps <- flowInsertAnnuaire "Annuaire" $ map (ToDbContact . addUniqIdsContact) contacts
  printDebug "length annuaire" ps

flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
                   -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
flowInsertAnnuaire name children = do

  (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
  ids <- insertDocuments masterUserId masterCorpusId NodeContact children

  (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
  _ <- add userCorpusId (map reId ids)

  --printDebug "AnnuaireID" userCorpusId

  pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)

-- TODO-ACCESS:
--   check userId CanFillUserCorpus userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId
--
-- TODO-EVENTS:
--   InsertedNgrams ?
--   InsertedNodeNgrams ?
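-- | Second half of the corpus flow: build the user ngrams list, extract the
-- ngrams of every inserted document, index them and link them to their
-- document nodes. Only the 'NodeCorpus' case is implemented.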
flowCorpus' :: FlowCmdM env err m
            => NodeType -> [HyperdataDocument]
            -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
            -> m CorpusId
flowCorpus' NodeCorpus hyperdataDocuments (ids,_masterUserId,_masterCorpusId, userId,userCorpusId) = do
  --------------------------------------------------
  -- List Ngrams Flow
  _userListId <- flowListUser userId userCorpusId 500
  --printDebug "Working on User ListId : " userListId

  let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
  -- printDebug "documentsWithId" documentsWithId
  docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
  -- printDebug "docsWithNgrams" docsWithNgrams
  let maps = mapNodeIdNgrams docsWithNgrams

  -- printDebug "maps" (maps)
  terms2id <- insertNgrams $ DM.keys maps
  let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
  -- printDebug "inserted ngrams" indexedNgrams
  _ <- insertToNodeNgrams indexedNgrams

  --listId2 <- flowList masterUserId masterCorpusId indexedNgrams
  --printDebug "Working on ListId : " listId2
  --------------------------------------------------
  _ <- mkDashboard userCorpusId userId
  _ <- mkGraph     userCorpusId userId

  -- Annuaire Flow
  -- _ <- mkAnnuaire rootUserId userId

  pure userCorpusId
  -- del [corpusId2, corpusId]

flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
flowCorpus' _ _ _ = undefined

type CorpusName = Text

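-- | Find (or create) the user's root node and corpus. For the master user the
-- corpora already under the root are reused; for any other user a corpus
-- named 'cName' is created.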
subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
subFlowCorpus username cName = do
  maybeUserId <- getUser username

  userId <- case maybeUserId of
    Nothing   -> nodeError NoUserFound
    -- mk NodeUser gargantua_id "Node Gargantua"
    Just user -> pure $ userLight_id user

  rootId' <- map _node_id <$> getRoot username

  rootId'' <- case rootId' of
    [] -> mkRoot username userId
    n  -> if length n >= 2
            then nodeError ManyNodeUsers
            else pure rootId'
  rootId <- maybe (nodeError NoRootFound) pure (head rootId'')

  corpusId'' <- if username == userMaster
    then do
      ns <- getCorporaWithParentId rootId
      pure $ map _node_id ns
    else
      pure []

  corpusId' <- if corpusId'' /= []
    then pure corpusId''
    else mkCorpus (Just cName) Nothing rootId userId

  corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')

  --printDebug "(username, userId, rootId, corpusId)"
  --           (username, userId, rootId, corpusId)
  pure (userId, rootId, corpusId)

subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
subFlowAnnuaire username _cName = do
  maybeUserId <- getUser username

  userId <- case maybeUserId of
    Nothing   -> nodeError NoUserFound
    -- mk NodeUser gargantua_id "Node Gargantua"
    Just user -> pure $ userLight_id user

  rootId' <- map _node_id <$> getRoot username

  rootId'' <- case rootId' of
    [] -> mkRoot username userId
    n  -> if length n >= 2
            then nodeError ManyNodeUsers
            else pure rootId'
  rootId <- maybe (nodeError NoRootFound) pure (head rootId'')

  corpusId' <- mkAnnuaire rootId userId

  corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')

  --printDebug "(username, userId, rootId, corpusId)"
  --           (username, userId, rootId, corpusId)
  pure (userId, rootId, corpusId)

------------------------------------------------------------------------
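-- | Index the documents to insert by their unique hash. Documents lacking a
-- 'uniqId' all collapse onto the same placeholder key.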
toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
  where
    err = "Database.Flow.toInsert"

toInserted :: [ReturnId] -> Map HashId ReturnId
toInserted = DM.fromList . map (\r -> (reUniqId r, r))
                         . filter reInserted

data DocumentWithId =
     DocumentWithId { documentId   :: !NodeId
                    , documentData :: !HyperdataDocument
                    } deriving (Show)

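-- | Join the insertion results with the original documents through their
-- shared hash, keeping only the documents that were actually inserted.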
mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
mergeData rs = catMaybes . map toDocumentWithId . DM.toList
  where
    toDocumentWithId (hash, hpd) =
      DocumentWithId <$> fmap reId (lookup hash rs)
                     <*> Just hpd

------------------------------------------------------------------------
data DocumentIdWithNgrams =
     DocumentIdWithNgrams
     { documentWithId  :: !DocumentWithId
     , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
     } deriving (Show)

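-- | Extract the ngrams of one document: its source, institutes, authors and
-- the multi-terms found in its title and abstract (English only for now),
-- each tagged with the corresponding 'NgramsType'.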
-- TODO group terms
extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
extractNgramsT doc = do
  let source     = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
  let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . splitOn ", ") $ _hyperdataDocument_institutes doc
  let authors    = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
  let leText     = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
  terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)

  pure $ DM.fromList $  [(source, DM.singleton Sources 1)]
                     <> [(i', DM.singleton Institutes  1) | i' <- institutes ]
                     <> [(a', DM.singleton Authors     1) | a' <- authors    ]
                     <> [(t', DM.singleton NgramsTerms 1) | t' <- terms'     ]


documentIdWithNgrams :: HasNodeError err
                     => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
documentIdWithNgrams f = mapM toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e

-- | TODO check optimization
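-- Group the per-document ngrams maps into a single map indexed by ngram,
-- then by 'NgramsType', then by document 'NodeId', summing the counts.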
mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
  where
    f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
    f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
      where
        nId = documentId $ documentWithId d

------------------------------------------------------------------------
flowList :: HasNodeError err => UserId -> CorpusId
         -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
flowList uId cId _ngs = do
  -- printDebug "ngs:" ngs
  lId <- getOrMkList cId uId
  --printDebug "ngs" (DM.keys ngs)
  -- TODO add stemming equivalence of 2 ngrams
  -- TODO needs rework
  -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
  -- _ <- insertGroups lId groupEd

  -- compute Candidate / Map
  --is <- insertLists lId $ ngrams2list ngs
  --printDebug "listNgrams inserted :" is

  pure lId

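-- | Build the user list of a corpus: get (or create) the list node, take the
-- first 'n' terms sorted by TFICF score and insert them as 'GraphList'
-- ngrams elements.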
flowListUser :: FlowCmdM env err m
             => UserId -> CorpusId -> Int -> m ListId
flowListUser uId cId n = do
  lId <- getOrMkList cId uId
  -- is <- insertLists lId $ ngrams2list ngs

  ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
  -- _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]

  insertNewListOfNgramsElements lId NgramsTerms $
    [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
    | ng <- ngs ]

  pure lId

------------------------------------------------------------------------

{-
TODO rework:
  * quadratic
  * DM.keys called twice
groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
              -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
              -> Map NgramsIndexed NgramsIndexed
groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
-}

-- TODO check: do not insert duplicates
insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
insertGroups lId ngrs =
  insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
                            | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
                            , ng1 /= ng2
                            ]

------------------------------------------------------------------------
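-- | Flatten an ngrams map into (ListType, (NgramsType, ngrams)) pairs;
-- every ngram is proposed as a 'CandidateList' entry.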
ngrams2list :: Map NgramsIndexed (Map NgramsType a)
            -> [(ListType, (NgramsType, NgramsIndexed))]
ngrams2list m =
  [ (CandidateList, (t, ng))
  | (ng, tm) <- DM.toList m
  , t <- DM.keys tm
  ]

-- | TODO: weight of the list could be a probability
insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
                                        | (l, (ngt, ng)) <- lngs
                                        ]
------------------------------------------------------------------------