]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
Merge branch 'annuaire'
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
17 where
18
19 import GHC.Show (Show)
20 import System.FilePath (FilePath)
21 import Data.Maybe (Maybe(..), catMaybes)
22 import Data.Text (Text, splitOn)
23 import Data.Map (Map, lookup)
24 import Data.Tuple.Extra (both, second)
25 import qualified Data.Map as DM
26
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard, mkAnnuaire)
32 import Gargantext.Database.Types.Node (NodeType(..))
33 import Gargantext.Database.Node.Document.Add (add)
34 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
35 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
36 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
37 import Gargantext.Database.Types.Node (HyperdataDocument(..))
38 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
39 import Gargantext.Database.User (getUser, UserLight(..), Username)
40 import Gargantext.Ext.IMT (toSchoolName)
41 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
42 import Gargantext.Prelude
43 import Gargantext.Text.Parsers (parseDocs, FileFormat)
44
45 type UserId = Int
46 type MasterUserId = Int
47
48 type RootId = Int
49 type CorpusId = Int
50 type MasterCorpusId = Int
51
52 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
53 flowDatabase ff fp cName = do
54 -- Corpus Flow
55 hyperdataDocuments <- map addUniqIdsDoc <$> parseDocs ff fp
56 params <- flowInsert NodeCorpus hyperdataDocuments cName
57 flowCorpus NodeCorpus hyperdataDocuments params
58
59
60 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
61 -> IO ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
62 flowInsert _nt hyperdataDocuments cName = do
63 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
64
65 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
66 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId hyperdataDocuments'
67
68 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
69 _ <- runCmd' $ add userCorpusId (map reId ids)
70
71 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
72
73
74 flowAnnuaire :: FilePath -> IO ()
75 flowAnnuaire filePath = do
76 contacts <- deserialiseImtUsersFromFile filePath
77 ps <- flowInsertAnnuaire "Annuaire" $ take 10 $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
78 printDebug "length annuaire" (ps)
79
80 --{-
81
82 flowInsertAnnuaire :: CorpusName
83 -> [ToDbData]
84 -> IO ([ReturnId], UserId, CorpusId, UserId, CorpusId)
85 flowInsertAnnuaire name children = do
86
87 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
88 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId children
89
90 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
91 _ <- runCmd' $ add userCorpusId (map reId ids)
92
93 printDebug "AnnuaireID" userCorpusId
94
95 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
96
97
98 --}
99
100 --{-
101 flowCorpus :: NodeType
102 -> [HyperdataDocument]
103 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
104 -> IO CorpusId
105 flowCorpus NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
106 --}
107 --------------------------------------------------
108 -- List Ngrams Flow
109 userListId <- runCmd' $ listFlowUser userId userCorpusId
110 printDebug "Working on User ListId : " userListId
111
112 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
113 -- printDebug "documentsWithId" documentsWithId
114 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
115 -- printDebug "docsWithNgrams" docsWithNgrams
116 let maps = mapNodeIdNgrams docsWithNgrams
117
118 -- printDebug "maps" (maps)
119 indexedNgrams <- runCmd' $ indexNgrams maps
120 -- printDebug "inserted ngrams" indexedNgrams
121 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
122
123 listId2 <- runCmd' $ listFlow masterUserId masterCorpusId indexedNgrams
124 printDebug "Working on ListId : " listId2
125 --}
126
127 --------------------------------------------------
128 _ <- runCmd' $ mkDashboard userCorpusId userId
129 _ <- runCmd' $ mkGraph userCorpusId userId
130
131 -- Annuaire Flow
132 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
133
134 pure userCorpusId
135 -- runCmd' $ del [corpusId2, corpusId]
136
137 flowCorpus NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
138 flowCorpus _ _ _ = undefined
139
140
141 type CorpusName = Text
142
143 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
144 subFlowCorpus username cName = do
145 maybeUserId <- runCmd' (getUser username)
146
147 let userId = case maybeUserId of
148 Nothing -> panic "Error: User does not exist (yet)"
149 -- mk NodeUser gargantua_id "Node Gargantua"
150 Just user -> userLight_id user
151
152 rootId' <- map _node_id <$> runCmd' (getRoot userId)
153
154 rootId'' <- case rootId' of
155 [] -> runCmd' (mkRoot username userId)
156 n -> case length n >= 2 of
157 True -> panic "Error: more than 1 userNode / user"
158 False -> pure rootId'
159 let rootId = maybe (panic "error rootId") identity (head rootId'')
160
161 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
162 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
163
164 printDebug "(username, userId, rootId, corpusId)"
165 (username, userId, rootId, corpusId)
166 pure (userId, rootId, corpusId)
167
168
169 subFlowAnnuaire :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
170 subFlowAnnuaire username _cName = do
171 maybeUserId <- runCmd' (getUser username)
172
173 let userId = case maybeUserId of
174 Nothing -> panic "Error: User does not exist (yet)"
175 -- mk NodeUser gargantua_id "Node Gargantua"
176 Just user -> userLight_id user
177
178 rootId' <- map _node_id <$> runCmd' (getRoot userId)
179
180 rootId'' <- case rootId' of
181 [] -> runCmd' (mkRoot username userId)
182 n -> case length n >= 2 of
183 True -> panic "Error: more than 1 userNode / user"
184 False -> pure rootId'
185 let rootId = maybe (panic "error rootId") identity (head rootId'')
186
187 corpusId' <- runCmd' $ mkAnnuaire rootId userId
188 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
189
190 printDebug "(username, userId, rootId, corpusId)"
191 (username, userId, rootId, corpusId)
192 pure (userId, rootId, corpusId)
193
194
195
196 ------------------------------------------------------------------------
197
198 type HashId = Text
199 type NodeId = Int
200 type ListId = Int
201
202 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
203 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
204 where
205 err = "Database.Flow.toInsert"
206
207 toInserted :: [ReturnId] -> Map HashId ReturnId
208 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
209 . filter (\r -> reInserted r == True)
210
211 data DocumentWithId =
212 DocumentWithId { documentId :: NodeId
213 , documentData :: HyperdataDocument
214 } deriving (Show)
215
216 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
217 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
218 where
219 toDocumentWithId (hash,hpd) =
220 DocumentWithId <$> fmap reId (lookup hash rs)
221 <*> Just hpd
222
223 ------------------------------------------------------------------------
224
225 data DocumentIdWithNgrams =
226 DocumentIdWithNgrams
227 { documentWithId :: DocumentWithId
228 , document_ngrams :: Map (NgramsT Ngrams) Int
229 } deriving (Show)
230
231 -- TODO add Terms (Title + Abstract)
232 -- add f :: Text -> Text
233 -- newtype Ngrams = Ngrams Text
234 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
235 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
236 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
237 <> [(NgramsT Authors a' , 1)| a' <- authors ]
238 where
239 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
240 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
241 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
242 -- TODO group terms
243
244
245
246
247 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
248 -> [DocumentWithId] -> [DocumentIdWithNgrams]
249 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
250
251 -- | TODO check optimization
252 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
253 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
254 where
255 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
256 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
257
258 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
259 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
260 indexNgrams ng2nId = do
261 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
262 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
263
264
265 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
266 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
267 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
268 | (ng, nId2int) <- DM.toList m
269 , (nId, n) <- DM.toList nId2int
270 ]
271
272
273 ------------------------------------------------------------------------
274 ------------------------------------------------------------------------
275 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
276 listFlow uId cId ngs = do
277 -- printDebug "ngs:" ngs
278 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
279 --printDebug "ngs" (DM.keys ngs)
280 -- TODO add stemming equivalence of 2 ngrams
281 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
282 _ <- insertGroups lId groupEd
283
284 -- compute Candidate / Map
285 let lists = ngrams2list ngs
286 -- printDebug "lists:" lists
287
288 is <- insertLists lId lists
289 printDebug "listNgrams inserted :" is
290
291 pure lId
292
293 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
294 listFlowUser uId cId = mkList cId uId
295
296 ------------------------------------------------------------------------
297
298 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
299 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
300 -> Map NgramsIndexed NgramsIndexed
301 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
302
303
304
305 -- TODO check: do not insert duplicates
306 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
307 insertGroups lId ngrs =
308 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
309 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
310 , ng1 /= ng2
311 ]
312
313 ------------------------------------------------------------------------
314 -- TODO: verify NgramsT lost here
315 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
316 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
317
318 -- | TODO: weight of the list could be a probability
319 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
320 insertLists lId lngs =
321 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
322 | (l,ngr) <- map (second _ngramsId) lngs
323 ]
324
325 ------------------------------------------------------------------------
326 ------------------------------------------------------------------------
327