]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
Merge branch 'dev-ngrams-repo' of ssh://delanoe.org/haskell-gargantext into dev-ngram...
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ConstraintKinds #-}
13 {-# LANGUAGE DeriveGeneric #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE RankNTypes #-}
17 {-# LANGUAGE FlexibleContexts #-}
18
19 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
20 where
21
22 --import Control.Lens (view)
23 import Control.Monad.IO.Class (liftIO)
24 --import Gargantext.Core.Types
25 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
26 import Data.Map (Map, lookup)
27 import Data.Maybe (Maybe(..), catMaybes)
28 import Data.Monoid
29 import Data.Text (Text, splitOn, intercalate)
30 import Data.Tuple.Extra (both)
31 import Data.List (concat)
32 import GHC.Show (Show)
33 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
34 import Gargantext.Core.Types.Individu (Username)
35 import Gargantext.Core.Types.Main
36 import Gargantext.Core (Lang(..))
37 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
38 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
39 import Gargantext.Database.Metrics.TFICF (getTficf)
40 import Gargantext.Text.Terms (extractTerms)
41 import Gargantext.Text.Metrics.TFICF (Tficf(..))
42 import Gargantext.Database.Node.Document.Add (add)
43 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
44 import Gargantext.Database.Root (getRoot)
45 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
46 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
47 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
48 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
49 import Gargantext.Database.Schema.User (getUser, UserLight(..))
50 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
51 import Gargantext.Database.Utils (Cmd, CmdM)
52 import Gargantext.Text.Terms (TermType(..))
53 import Gargantext.Ext.IMT (toSchoolName)
54 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
55 import Gargantext.Prelude
56 import Gargantext.Text.Parsers (parseDocs, FileFormat)
57 import System.FilePath (FilePath)
58 import Gargantext.API.Ngrams (HasRepoVar)
59 import Servant (ServantErr)
60 import Gargantext.API.Ngrams (NgramsElement(..), insertNewListOfNgramsElements, RepoCmdM)
61 --import Gargantext.Database.Schema.User (insertUsers, simpleUser, gargantuaUser)
62 import qualified Data.Map as DM
63
64 type FlowCmdM env err m =
65 ( CmdM env err m
66 , RepoCmdM env err m
67 , HasNodeError err
68 , HasRepoVar env
69 )
70
71
72 flowCorpus :: FlowCmdM env ServantErr m
73 => FileFormat -> FilePath -> CorpusName -> m CorpusId
74 flowCorpus ff fp cName = do
75 --insertUsers [gargantuaUser, simpleUser]
76 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
77 params <- flowInsert NodeCorpus hyperdataDocuments' cName
78 flowCorpus' NodeCorpus hyperdataDocuments' params
79
80
81 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
82 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
83 flowInsert _nt hyperdataDocuments cName = do
84 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
85
86 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
87 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
88
89 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
90 _ <- add userCorpusId (map reId ids)
91
92 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
93
94
95 flowAnnuaire :: FlowCmdM env ServantErr m => FilePath -> m ()
96 flowAnnuaire filePath = do
97 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
98 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
99 printDebug "length annuaire" ps
100
101
102 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
103 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
104 flowInsertAnnuaire name children = do
105
106 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
107 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
108
109 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
110 _ <- add userCorpusId (map reId ids)
111
112 printDebug "AnnuaireID" userCorpusId
113
114 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
115
116 -- TODO-ACCESS:
117 -- check userId CanFillUserCorpus userCorpusId
118 -- check masterUserId CanFillMasterCorpus masterCorpusId
119 --
120 -- TODO-EVENTS:
121 -- InsertedNgrams ?
122 -- InsertedNodeNgrams ?
123 flowCorpus' :: FlowCmdM env err m
124 => NodeType -> [HyperdataDocument]
125 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
126 -> m CorpusId
127 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
128 --------------------------------------------------
129 -- List Ngrams Flow
130 userListId <- flowListUser userId userCorpusId 500
131 printDebug "Working on User ListId : " userListId
132
133 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
134 printDebug "documentsWithId" documentsWithId
135 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
136 printDebug "docsWithNgrams" docsWithNgrams
137 let maps = mapNodeIdNgrams docsWithNgrams
138
139 -- printDebug "maps" (maps)
140 terms2id <- insertNgrams $ DM.keys maps
141 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
142 printDebug "inserted ngrams" indexedNgrams
143 _ <- insertToNodeNgrams indexedNgrams
144
145 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
146 printDebug "Working on ListId : " listId2
147 --}
148 --------------------------------------------------
149 _ <- mkDashboard userCorpusId userId
150 _ <- mkGraph userCorpusId userId
151
152 -- Annuaire Flow
153 -- _ <- mkAnnuaire rootUserId userId
154
155 pure userCorpusId
156 -- del [corpusId2, corpusId]
157
158 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
159 flowCorpus' _ _ _ = undefined
160
161
162 type CorpusName = Text
163
164 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
165 subFlowCorpus username cName = do
166 maybeUserId <- getUser username
167 userId <- case maybeUserId of
168 Nothing -> nodeError NoUserFound
169 -- mk NodeUser gargantua_id "Node Gargantua"
170 Just user -> pure $ userLight_id user
171
172 printDebug "userId" userId
173 rootId' <- map _node_id <$> getRoot username
174
175 printDebug "rootId'" rootId'
176 rootId'' <- case rootId' of
177 [] -> mkRoot username userId
178 n -> case length n >= 2 of
179 True -> nodeError ManyNodeUsers
180 False -> pure rootId'
181
182 printDebug "rootId''" rootId''
183 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
184
185 corpusId'' <- if username == userMaster
186 then do
187 ns <- getCorporaWithParentId rootId
188 pure $ map _node_id ns
189 else
190 pure []
191
192 corpusId' <- if corpusId'' /= []
193 then pure corpusId''
194 else mkCorpus (Just cName) Nothing rootId userId
195
196 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
197
198 printDebug "(username, userId, rootId, corpusId)"
199 (username, userId, rootId, corpusId)
200 pure (userId, rootId, corpusId)
201
202
203 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
204 subFlowAnnuaire username _cName = do
205 maybeUserId <- getUser username
206
207 userId <- case maybeUserId of
208 Nothing -> nodeError NoUserFound
209 -- mk NodeUser gargantua_id "Node Gargantua"
210 Just user -> pure $ userLight_id user
211
212 rootId' <- map _node_id <$> getRoot username
213
214 rootId'' <- case rootId' of
215 [] -> mkRoot username userId
216 n -> case length n >= 2 of
217 True -> nodeError ManyNodeUsers
218 False -> pure rootId'
219 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
220
221 corpusId' <- mkAnnuaire rootId userId
222
223 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
224
225 printDebug "(username, userId, rootId, corpusId)"
226 (username, userId, rootId, corpusId)
227 pure (userId, rootId, corpusId)
228
229 ------------------------------------------------------------------------
230 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
231 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
232 where
233 err = "Database.Flow.toInsert"
234
235 toInserted :: [ReturnId] -> Map HashId ReturnId
236 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
237 . filter (\r -> reInserted r == True)
238
239 data DocumentWithId =
240 DocumentWithId { documentId :: !NodeId
241 , documentData :: !HyperdataDocument
242 } deriving (Show)
243
244 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
245 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
246 where
247 toDocumentWithId (hash,hpd) =
248 DocumentWithId <$> fmap reId (lookup hash rs)
249 <*> Just hpd
250
251 ------------------------------------------------------------------------
252 data DocumentIdWithNgrams =
253 DocumentIdWithNgrams
254 { documentWithId :: !DocumentWithId
255 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
256 } deriving (Show)
257
258 -- TODO group terms
259 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
260 extractNgramsT doc = do
261 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
262 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
263 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
264 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
265 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
266
267 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
268 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
269 <> [(a', DM.singleton Authors 1) | a' <- authors ]
270 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
271
272
273
274 documentIdWithNgrams :: HasNodeError err
275 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
276 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
277 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
278 where
279 toDocumentIdWithNgrams d = do
280 e <- f $ documentData d
281 pure $ DocumentIdWithNgrams d e
282
283 -- | TODO check optimization
284 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
285 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
286 where
287 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
288 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
289 where
290 nId = documentId $ documentWithId d
291
292 ------------------------------------------------------------------------
293 flowList :: HasNodeError err => UserId -> CorpusId
294 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
295 flowList uId cId ngs = do
296 -- printDebug "ngs:" ngs
297 lId <- getOrMkList cId uId
298 printDebug "ngs" (DM.keys ngs)
299 -- TODO add stemming equivalence of 2 ngrams
300 -- TODO needs rework
301 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
302 -- _ <- insertGroups lId groupEd
303
304 -- compute Candidate / Map
305 is <- insertLists lId $ ngrams2list ngs
306 printDebug "listNgrams inserted :" is
307
308 pure lId
309
310 flowListUser :: FlowCmdM env err m
311 => UserId -> CorpusId -> Int -> m ListId
312 flowListUser uId cId n = do
313 lId <- getOrMkList cId uId
314 -- is <- insertLists lId $ ngrams2list ngs
315
316 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
317 -- _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
318
319 insertNewListOfNgramsElements lId NgramsTerms $
320 [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
321 | ng <- ngs ]
322
323 pure lId
324
325 ------------------------------------------------------------------------
326
327 {-
328 TODO rework:
329 * quadratic
330 * DM.keys called twice
331 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
332 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
333 -> Map NgramsIndexed NgramsIndexed
334 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
335 -}
336
337
338 -- TODO check: do not insert duplicates
339 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
340 insertGroups lId ngrs =
341 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
342 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
343 , ng1 /= ng2
344 ]
345
346 ------------------------------------------------------------------------
347 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
348 -> [(ListType, (NgramsType,NgramsIndexed))]
349 ngrams2list m =
350 [ (CandidateList, (t, ng))
351 | (ng, tm) <- DM.toList m
352 , t <- DM.keys tm
353 ]
354
355 -- | TODO: weight of the list could be a probability
356 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
357 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
358 | (l,(ngt, ng)) <- lngs
359 ]
360 ------------------------------------------------------------------------