]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
Merge branch 'dev-ngrams-repo' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell...
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
16
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
18 where
19
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
26 import Data.Text (Text, splitOn, intercalate)
27 import Data.Tuple.Extra (both)
28 import Data.List (concat)
29 import GHC.Show (Show)
30 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
31 import Gargantext.Core.Types.Individu (Username)
32 import Gargantext.Core.Types.Main
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
35 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
36 import Gargantext.Database.Metrics.TFICF (getTficf)
37 import Gargantext.Text.Terms (extractTerms)
38 import Gargantext.Text.Metrics.TFICF (Tficf(..))
39 import Gargantext.Database.Node.Document.Add (add)
40 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
41 import Gargantext.Database.Root (getRoot)
42 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
43 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
44 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
45 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
46 import Gargantext.Database.Schema.User (getUser, UserLight(..))
47 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
48 import Gargantext.Database.Utils (Cmd)
49 import Gargantext.Text.Terms (TermType(..))
50 import Gargantext.Ext.IMT (toSchoolName)
51 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
52 import Gargantext.Prelude
53 import Gargantext.Text.Parsers (parseDocs, FileFormat)
54 import System.FilePath (FilePath)
55 import qualified Data.Map as DM
56
57
58 flowCorpus :: HasNodeError err => FileFormat -> FilePath -> CorpusName -> Cmd err CorpusId
59 flowCorpus ff fp cName = do
60 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
61 params <- flowInsert NodeCorpus hyperdataDocuments' cName
62 flowCorpus' NodeCorpus hyperdataDocuments' params
63
64
65 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
66 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
67 flowInsert _nt hyperdataDocuments cName = do
68 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
69
70 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
71 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
72
73 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
74 _ <- add userCorpusId (map reId ids)
75
76 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
77
78
79 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
80 flowAnnuaire filePath = do
81 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
82 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
83 printDebug "length annuaire" ps
84
85
86 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
87 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
88 flowInsertAnnuaire name children = do
89
90 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
91 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
92
93 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
94 _ <- add userCorpusId (map reId ids)
95
96 --printDebug "AnnuaireID" userCorpusId
97
98 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
99
100 -- TODO-ACCESS:
101 -- check userId CanFillUserCorpus userCorpusId
102 -- check masterUserId CanFillMasterCorpus masterCorpusId
103 --
104 -- TODO-EVENTS:
105 -- InsertedNgrams ?
106 -- InsertedNodeNgrams ?
107 flowCorpus' :: HasNodeError err
108 => NodeType -> [HyperdataDocument]
109 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
110 -> Cmd err CorpusId
111 flowCorpus' NodeCorpus hyperdataDocuments (ids,_masterUserId,_masterCorpusId, userId,userCorpusId) = do
112 --------------------------------------------------
113 -- List Ngrams Flow
114 _userListId <- flowListUser userId userCorpusId 500
115 --printDebug "Working on User ListId : " userListId
116
117 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
118 -- printDebug "documentsWithId" documentsWithId
119 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
120 -- printDebug "docsWithNgrams" docsWithNgrams
121 let maps = mapNodeIdNgrams docsWithNgrams
122
123 -- printDebug "maps" (maps)
124 terms2id <- insertNgrams $ DM.keys maps
125 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
126 -- printDebug "inserted ngrams" indexedNgrams
127 _ <- insertToNodeNgrams indexedNgrams
128
129 --listId2 <- flowList masterUserId masterCorpusId indexedNgrams
130 --printDebug "Working on ListId : " listId2
131 --}
132 --------------------------------------------------
133 _ <- mkDashboard userCorpusId userId
134 _ <- mkGraph userCorpusId userId
135
136 -- Annuaire Flow
137 -- _ <- mkAnnuaire rootUserId userId
138
139 pure userCorpusId
140 -- del [corpusId2, corpusId]
141
142 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
143 flowCorpus' _ _ _ = undefined
144
145
146 type CorpusName = Text
147
148 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
149 subFlowCorpus username cName = do
150 maybeUserId <- getUser username
151
152 userId <- case maybeUserId of
153 Nothing -> nodeError NoUserFound
154 -- mk NodeUser gargantua_id "Node Gargantua"
155 Just user -> pure $ userLight_id user
156
157 rootId' <- map _node_id <$> getRoot username
158
159 rootId'' <- case rootId' of
160 [] -> mkRoot username userId
161 n -> case length n >= 2 of
162 True -> nodeError ManyNodeUsers
163 False -> pure rootId'
164 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
165
166 corpusId'' <- if username == userMaster
167 then do
168 ns <- getCorporaWithParentId rootId
169 pure $ map _node_id ns
170 else
171 pure []
172
173 corpusId' <- if corpusId'' /= []
174 then pure corpusId''
175 else mkCorpus (Just cName) Nothing rootId userId
176
177 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
178
179 --printDebug "(username, userId, rootId, corpusId)"
180 -- (username, userId, rootId, corpusId)
181 pure (userId, rootId, corpusId)
182
183
184 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
185 subFlowAnnuaire username _cName = do
186 maybeUserId <- getUser username
187
188 userId <- case maybeUserId of
189 Nothing -> nodeError NoUserFound
190 -- mk NodeUser gargantua_id "Node Gargantua"
191 Just user -> pure $ userLight_id user
192
193 rootId' <- map _node_id <$> getRoot username
194
195 rootId'' <- case rootId' of
196 [] -> mkRoot username userId
197 n -> case length n >= 2 of
198 True -> nodeError ManyNodeUsers
199 False -> pure rootId'
200 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
201
202 corpusId' <- mkAnnuaire rootId userId
203
204 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
205
206 --printDebug "(username, userId, rootId, corpusId)"
207 -- (username, userId, rootId, corpusId)
208 pure (userId, rootId, corpusId)
209
210 ------------------------------------------------------------------------
211 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
212 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
213 where
214 err = "Database.Flow.toInsert"
215
216 toInserted :: [ReturnId] -> Map HashId ReturnId
217 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
218 . filter (\r -> reInserted r == True)
219
220 data DocumentWithId =
221 DocumentWithId { documentId :: !NodeId
222 , documentData :: !HyperdataDocument
223 } deriving (Show)
224
225 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
226 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
227 where
228 toDocumentWithId (hash,hpd) =
229 DocumentWithId <$> fmap reId (lookup hash rs)
230 <*> Just hpd
231
232 ------------------------------------------------------------------------
233 data DocumentIdWithNgrams =
234 DocumentIdWithNgrams
235 { documentWithId :: !DocumentWithId
236 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
237 } deriving (Show)
238
239 -- TODO group terms
240 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
241 extractNgramsT doc = do
242 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
243 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
244 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
245 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
246 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
247
248 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
249 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
250 <> [(a', DM.singleton Authors 1) | a' <- authors ]
251 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
252
253
254
255 documentIdWithNgrams :: HasNodeError err
256 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
257 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
258 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
259 where
260 toDocumentIdWithNgrams d = do
261 e <- f $ documentData d
262 pure $ DocumentIdWithNgrams d e
263
264 -- | TODO check optimization
265 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
266 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
267 where
268 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
269 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
270 where
271 nId = documentId $ documentWithId d
272
273 ------------------------------------------------------------------------
274 flowList :: HasNodeError err => UserId -> CorpusId
275 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
276 flowList uId cId _ngs = do
277 -- printDebug "ngs:" ngs
278 lId <- getOrMkList cId uId
279 --printDebug "ngs" (DM.keys ngs)
280 -- TODO add stemming equivalence of 2 ngrams
281 -- TODO needs rework
282 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
283 -- _ <- insertGroups lId groupEd
284
285 -- compute Candidate / Map
286 --is <- insertLists lId $ ngrams2list ngs
287 --printDebug "listNgrams inserted :" is
288
289 pure lId
290
291 flowListUser :: HasNodeError err => UserId -> CorpusId -> Int -> Cmd err NodeId
292 flowListUser uId cId n = do
293 lId <- getOrMkList cId uId
294 -- is <- insertLists lId $ ngrams2list ngs
295
296 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
297 _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
298
299 pure lId
300
301 ------------------------------------------------------------------------
302
303 {-
304 TODO rework:
305 * quadratic
306 * DM.keys called twice
307 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
308 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
309 -> Map NgramsIndexed NgramsIndexed
310 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
311 -}
312
313
314 -- TODO check: do not insert duplicates
315 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
316 insertGroups lId ngrs =
317 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
318 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
319 , ng1 /= ng2
320 ]
321
322 ------------------------------------------------------------------------
323 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
324 -> [(ListType, (NgramsType,NgramsIndexed))]
325 ngrams2list m =
326 [ (CandidateList, (t, ng))
327 | (ng, tm) <- DM.toList m
328 , t <- DM.keys tm
329 ]
330
331 -- | TODO: weight of the list could be a probability
332 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
333 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
334 | (l,(ngt, ng)) <- lngs
335 ]
336 ------------------------------------------------------------------------
337