]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[WIP] Connecting graph.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
16
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
18 where
19
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
26 import Data.Text (Text, splitOn, intercalate)
27 import Data.Tuple.Extra (both)
28 import Data.List (concat)
29 import GHC.Show (Show)
30 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
31 import Gargantext.Core.Types.Individu (Username)
32 import Gargantext.Core.Types.Main
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
35 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
36 import Gargantext.Database.Metrics.TFICF (getTficf)
37 import Gargantext.Text.Terms (extractTerms)
38 import Gargantext.Text.Metrics.TFICF (Tficf(..))
39 import Gargantext.Database.Node.Document.Add (add)
40 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
41 import Gargantext.Database.Root (getRoot)
42 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
43 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
44 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
45 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
46 import Gargantext.Database.Schema.User (getUser, UserLight(..))
47 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
48 import Gargantext.Database.Utils (Cmd)
49 import Gargantext.Text.Terms (TermType(..))
50 import Gargantext.Ext.IMT (toSchoolName)
51 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
52 import Gargantext.Prelude
53 import Gargantext.Text.Parsers (parseDocs, FileFormat)
54 import System.FilePath (FilePath)
55 import qualified Data.Map as DM
56
57
58 flowCorpus :: HasNodeError err => FileFormat -> FilePath -> CorpusName -> Cmd err CorpusId
59 flowCorpus ff fp cName = do
60 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
61 params <- flowInsert NodeCorpus hyperdataDocuments' cName
62 flowCorpus' NodeCorpus hyperdataDocuments' params
63
64
65 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
66 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
67 flowInsert _nt hyperdataDocuments cName = do
68 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
69
70 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
71 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
72
73 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
74 _ <- add userCorpusId (map reId ids)
75
76 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
77
78
79 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
80 flowAnnuaire filePath = do
81 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
82 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
83 printDebug "length annuaire" ps
84
85
86 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
87 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
88 flowInsertAnnuaire name children = do
89
90 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
91 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
92
93 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
94 _ <- add userCorpusId (map reId ids)
95
96 printDebug "AnnuaireID" userCorpusId
97
98 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
99
100
101 flowCorpus' :: HasNodeError err
102 => NodeType -> [HyperdataDocument]
103 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
104 -> Cmd err CorpusId
105 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
106 --------------------------------------------------
107 -- List Ngrams Flow
108 userListId <- flowListUser userId userCorpusId 300
109 printDebug "Working on User ListId : " userListId
110
111 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
112 -- printDebug "documentsWithId" documentsWithId
113 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
114 -- printDebug "docsWithNgrams" docsWithNgrams
115 let maps = mapNodeIdNgrams docsWithNgrams
116
117 -- printDebug "maps" (maps)
118 terms2id <- insertNgrams $ DM.keys maps
119 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
120 -- printDebug "inserted ngrams" indexedNgrams
121 _ <- insertToNodeNgrams indexedNgrams
122
123 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
124 printDebug "Working on ListId : " listId2
125 --}
126 --------------------------------------------------
127 _ <- mkDashboard userCorpusId userId
128 _ <- mkGraph userCorpusId userId
129
130 -- Annuaire Flow
131 -- _ <- mkAnnuaire rootUserId userId
132
133 pure userCorpusId
134 -- del [corpusId2, corpusId]
135
136 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
137 flowCorpus' _ _ _ = undefined
138
139
140 type CorpusName = Text
141
142 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
143 subFlowCorpus username cName = do
144 maybeUserId <- getUser username
145
146 userId <- case maybeUserId of
147 Nothing -> nodeError NoUserFound
148 -- mk NodeUser gargantua_id "Node Gargantua"
149 Just user -> pure $ userLight_id user
150
151 rootId' <- map _node_id <$> getRoot username
152
153 rootId'' <- case rootId' of
154 [] -> mkRoot username userId
155 n -> case length n >= 2 of
156 True -> nodeError ManyNodeUsers
157 False -> pure rootId'
158 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
159
160 corpusId'' <- if username == userMaster
161 then do
162 ns <- getCorporaWithParentId rootId
163 pure $ map _node_id ns
164 else
165 pure []
166
167 corpusId' <- if corpusId'' /= []
168 then pure corpusId''
169 else mkCorpus (Just cName) Nothing rootId userId
170
171 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
172
173 printDebug "(username, userId, rootId, corpusId)"
174 (username, userId, rootId, corpusId)
175 pure (userId, rootId, corpusId)
176
177
178 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
179 subFlowAnnuaire username _cName = do
180 maybeUserId <- getUser username
181
182 userId <- case maybeUserId of
183 Nothing -> nodeError NoUserFound
184 -- mk NodeUser gargantua_id "Node Gargantua"
185 Just user -> pure $ userLight_id user
186
187 rootId' <- map _node_id <$> getRoot username
188
189 rootId'' <- case rootId' of
190 [] -> mkRoot username userId
191 n -> case length n >= 2 of
192 True -> nodeError ManyNodeUsers
193 False -> pure rootId'
194 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
195
196 corpusId' <- mkAnnuaire rootId userId
197
198 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
199
200 printDebug "(username, userId, rootId, corpusId)"
201 (username, userId, rootId, corpusId)
202 pure (userId, rootId, corpusId)
203
204 ------------------------------------------------------------------------
205 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
206 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
207 where
208 err = "Database.Flow.toInsert"
209
210 toInserted :: [ReturnId] -> Map HashId ReturnId
211 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
212 . filter (\r -> reInserted r == True)
213
214 data DocumentWithId =
215 DocumentWithId { documentId :: !NodeId
216 , documentData :: !HyperdataDocument
217 } deriving (Show)
218
219 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
220 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
221 where
222 toDocumentWithId (hash,hpd) =
223 DocumentWithId <$> fmap reId (lookup hash rs)
224 <*> Just hpd
225
226 ------------------------------------------------------------------------
227 data DocumentIdWithNgrams =
228 DocumentIdWithNgrams
229 { documentWithId :: !DocumentWithId
230 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
231 } deriving (Show)
232
233 -- TODO group terms
234 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
235 extractNgramsT doc = do
236 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
237 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
238 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
239 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
240 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
241
242 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
243 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
244 <> [(a', DM.singleton Authors 1) | a' <- authors ]
245 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
246
247
248
249 documentIdWithNgrams :: HasNodeError err
250 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
251 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
252 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
253 where
254 toDocumentIdWithNgrams d = do
255 e <- f $ documentData d
256 pure $ DocumentIdWithNgrams d e
257
258 -- | TODO check optimization
259 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
260 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
261 where
262 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
263 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
264 where
265 nId = documentId $ documentWithId d
266
267 ------------------------------------------------------------------------
268 flowList :: HasNodeError err => UserId -> CorpusId
269 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
270 flowList uId cId ngs = do
271 -- printDebug "ngs:" ngs
272 lId <- getOrMkList cId uId
273 --printDebug "ngs" (DM.keys ngs)
274 -- TODO add stemming equivalence of 2 ngrams
275 -- TODO needs rework
276 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
277 -- _ <- insertGroups lId groupEd
278
279 -- compute Candidate / Map
280 is <- insertLists lId $ ngrams2list ngs
281 printDebug "listNgrams inserted :" is
282
283 pure lId
284
285 flowListUser :: HasNodeError err => UserId -> CorpusId -> Int -> Cmd err NodeId
286 flowListUser uId cId n = do
287 lId <- getOrMkList cId uId
288 -- is <- insertLists lId $ ngrams2list ngs
289
290 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
291 _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
292
293 pure lId
294
295 ------------------------------------------------------------------------
296
297 {-
298 TODO rework:
299 * quadratic
300 * DM.keys called twice
301 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
302 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
303 -> Map NgramsIndexed NgramsIndexed
304 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
305 -}
306
307
308 -- TODO check: do not insert duplicates
309 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
310 insertGroups lId ngrs =
311 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
312 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
313 , ng1 /= ng2
314 ]
315
316 ------------------------------------------------------------------------
317 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
318 -> [(ListType, (NgramsType,NgramsIndexed))]
319 ngrams2list m =
320 [ (CandidateList, (t, ng))
321 | (ng, tm) <- DM.toList m
322 , t <- DM.keys tm
323 ]
324
325 -- | TODO: weight of the list could be a probability
326 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
327 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
328 | (l,(ngt, ng)) <- lngs
329 ]
330 ------------------------------------------------------------------------
331