]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[FLOW] optim ok.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
16
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
18 where
19
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
26 import Data.Text (Text, splitOn, intercalate)
27 import Data.Tuple.Extra (both)
28 import Data.List (concat)
29 import GHC.Show (Show)
30 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
31 import Gargantext.Core.Types.Individu (Username)
32 import Gargantext.Core.Types.Main
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
35 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
36 import Gargantext.Text.Terms (extractTerms)
37 import Gargantext.Database.Node.Document.Add (add)
38 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
39 import Gargantext.Database.Root (getRoot)
40 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams, ngramsTypeId)
41 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
42 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
44 import Gargantext.Database.Schema.User (getUser, UserLight(..))
45 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
46 import Gargantext.Database.Utils (Cmd)
47 import Gargantext.Text.Terms (TermType(..))
48 import Gargantext.Ext.IMT (toSchoolName)
49 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
50 import Gargantext.Prelude
51 import Gargantext.Text.Parsers (parseDocs, FileFormat)
52 import System.FilePath (FilePath)
53 import qualified Data.Map as DM
54 import qualified Data.Set as DS
55
56
57 flowCorpus :: HasNodeError err => FileFormat -> FilePath -> CorpusName -> Cmd err CorpusId
58 flowCorpus ff fp cName = do
59 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
60 params <- flowInsert NodeCorpus hyperdataDocuments' cName
61 flowCorpus' NodeCorpus hyperdataDocuments' params
62
63
64 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
65 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
66 flowInsert _nt hyperdataDocuments cName = do
67 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
68
69 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
70 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
71
72 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
73 _ <- add userCorpusId (map reId ids)
74
75 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
76
77
78 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
79 flowAnnuaire filePath = do
80 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
81 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
82 printDebug "length annuaire" ps
83
84
85 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
86 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
87 flowInsertAnnuaire name children = do
88
89 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
90 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
91
92 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
93 _ <- add userCorpusId (map reId ids)
94
95 printDebug "AnnuaireID" userCorpusId
96
97 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
98
99
100 flowCorpus' :: HasNodeError err
101 => NodeType -> [HyperdataDocument]
102 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
103 -> Cmd err CorpusId
104 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
105 --------------------------------------------------
106 -- List Ngrams Flow
107 userListId <- flowListUser userId userCorpusId
108 printDebug "Working on User ListId : " userListId
109
110 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
111 -- printDebug "documentsWithId" documentsWithId
112 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
113 -- printDebug "docsWithNgrams" docsWithNgrams
114 let maps = mapNodeIdNgrams docsWithNgrams
115
116 -- printDebug "maps" (maps)
117 terms2id <- insertNgrams (DS.toList $ DS.map _ngramsT (DM.keysSet maps))
118 let indexedNgrams = DM.mapKeys (indexNgramsT terms2id) maps
119 -- printDebug "inserted ngrams" indexedNgrams
120 _ <- insertToNodeNgrams indexedNgrams
121
122 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
123 printDebug "Working on ListId : " listId2
124 --}
125 --------------------------------------------------
126 _ <- mkDashboard userCorpusId userId
127 _ <- mkGraph userCorpusId userId
128
129 -- Annuaire Flow
130 -- _ <- mkAnnuaire rootUserId userId
131
132 pure userCorpusId
133 -- del [corpusId2, corpusId]
134
135 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
136 flowCorpus' _ _ _ = undefined
137
138
139 type CorpusName = Text
140
141 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
142 subFlowCorpus username cName = do
143 maybeUserId <- getUser username
144
145 userId <- case maybeUserId of
146 Nothing -> nodeError NoUserFound
147 -- mk NodeUser gargantua_id "Node Gargantua"
148 Just user -> pure $ userLight_id user
149
150 rootId' <- map _node_id <$> getRoot username
151
152 rootId'' <- case rootId' of
153 [] -> mkRoot username userId
154 n -> case length n >= 2 of
155 True -> nodeError ManyNodeUsers
156 False -> pure rootId'
157 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
158
159 corpusId'' <- if username == userMaster
160 then do
161 ns <- getCorporaWithParentId rootId
162 pure $ map _node_id ns
163 else
164 pure []
165
166 corpusId' <- if corpusId'' /= []
167 then pure corpusId''
168 else mkCorpus (Just cName) Nothing rootId userId
169
170 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
171
172 printDebug "(username, userId, rootId, corpusId)"
173 (username, userId, rootId, corpusId)
174 pure (userId, rootId, corpusId)
175
176
177 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
178 subFlowAnnuaire username _cName = do
179 maybeUserId <- getUser username
180
181 userId <- case maybeUserId of
182 Nothing -> nodeError NoUserFound
183 -- mk NodeUser gargantua_id "Node Gargantua"
184 Just user -> pure $ userLight_id user
185
186 rootId' <- map _node_id <$> getRoot username
187
188 rootId'' <- case rootId' of
189 [] -> mkRoot username userId
190 n -> case length n >= 2 of
191 True -> nodeError ManyNodeUsers
192 False -> pure rootId'
193 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
194
195 corpusId' <- mkAnnuaire rootId userId
196
197 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
198
199 printDebug "(username, userId, rootId, corpusId)"
200 (username, userId, rootId, corpusId)
201 pure (userId, rootId, corpusId)
202
203 ------------------------------------------------------------------------
204 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
205 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
206 where
207 err = "Database.Flow.toInsert"
208
209 toInserted :: [ReturnId] -> Map HashId ReturnId
210 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
211 . filter (\r -> reInserted r == True)
212
213 data DocumentWithId =
214 DocumentWithId { documentId :: !NodeId
215 , documentData :: !HyperdataDocument
216 } deriving (Show)
217
218 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
219 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
220 where
221 toDocumentWithId (hash,hpd) =
222 DocumentWithId <$> fmap reId (lookup hash rs)
223 <*> Just hpd
224
225 ------------------------------------------------------------------------
226 data DocumentIdWithNgrams =
227 DocumentIdWithNgrams
228 { documentWithId :: !DocumentWithId
229 , document_ngrams :: !(Map (NgramsT Ngrams) Int)
230 } deriving (Show)
231
232 -- TODO group terms
233 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map (NgramsT Ngrams) Int)
234 extractNgramsT doc = do
235
236 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
237 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
238 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
239 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
240 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
241
242 pure $ DM.fromList $ [(NgramsT Sources source, 1)]
243 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
244 <> [(NgramsT Authors a' , 1)| a' <- authors ]
245 <> [(NgramsT NgramsTerms t' , 1)| t' <- terms' ]
246
247
248
249 documentIdWithNgrams :: HasNodeError err => (HyperdataDocument -> Cmd err (Map (NgramsT Ngrams) Int))
250 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
251 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
252 where
253 toDocumentIdWithNgrams d = do
254 e <- f $ documentData d
255 pure $ DocumentIdWithNgrams d e
256
257 -- | TODO check optimization
258 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
259 mapNodeIdNgrams ds = DM.fromListWith (DM.unionWith (+)) xs
260 where
261 xs = [(ng, DM.singleton nId i) | (nId, n2i') <- ds', (ng, i) <- DM.toList n2i']
262 ds' = (\d -> ((documentId . documentWithId) d, document_ngrams d)) <$> ds
263
264 ------------------------------------------------------------------------
265 flowList :: HasNodeError err => UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd err ListId
266 flowList uId cId ngs = do
267 -- printDebug "ngs:" ngs
268 lId <- getOrMkList cId uId
269 --printDebug "ngs" (DM.keys ngs)
270 -- TODO add stemming equivalence of 2 ngrams
271 -- TODO needs rework
272 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
273 -- _ <- insertGroups lId groupEd
274
275 -- compute Candidate / Map
276 is <- insertLists lId $ ngrams2list ngs
277 printDebug "listNgrams inserted :" is
278
279 pure lId
280
281 flowListUser :: HasNodeError err => UserId -> CorpusId -> Cmd err NodeId
282 flowListUser uId cId = getOrMkList cId uId
283
284 ------------------------------------------------------------------------
285
286 {-
287 TODO rework:
288 * quadratic
289 * DM.keys called twice
290 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
291 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
292 -> Map NgramsIndexed NgramsIndexed
293 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
294 -}
295
296
297 -- TODO check: do not insert duplicates
298 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
299 insertGroups lId ngrs =
300 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
301 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
302 , ng1 /= ng2
303 ]
304
305 ------------------------------------------------------------------------
306 -- TODO: verify NgramsT lost here
307 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType, (NgramsType,NgramsIndexed))]
308 ngrams2list = zip (repeat GraphList) . map (\(NgramsT ngt ng) -> (ngt, ng)) . DM.keys
309
310 -- | TODO: weight of the list could be a probability
311 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
312 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
313 | (l,(ngt, ng)) <- lngs
314 ]
315 ------------------------------------------------------------------------