2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
19 import GHC.Show (Show)
20 import System.FilePath (FilePath)
21 import Data.Maybe (Maybe(..), catMaybes)
22 import Data.Text (Text, splitOn)
23 import Data.Map (Map, lookup)
24 import Data.Tuple.Extra (both, second)
25 import qualified Data.Map as DM
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard, mkAnnuaire)
32 import Gargantext.Database.Types.Node (NodeType(..))
33 import Gargantext.Database.Node.Document.Add (add)
34 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
35 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
36 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
37 import Gargantext.Database.Types.Node (HyperdataDocument(..))
38 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
39 import Gargantext.Database.User (getUser, UserLight(..), Username)
40 import Gargantext.Ext.IMT (toSchoolName)
41 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
42 import Gargantext.Prelude
43 import Gargantext.Text.Parsers (parseDocs, FileFormat)
46 type MasterUserId = Int
50 type MasterCorpusId = Int
52 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
53 flowDatabase ff fp cName = do
55 hyperdataDocuments <- map addUniqIdsDoc <$> parseDocs ff fp
56 params <- flowInsert NodeCorpus hyperdataDocuments cName
57 flowCorpus NodeCorpus hyperdataDocuments params
60 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
61 -> IO ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
62 flowInsert _nt hyperdataDocuments cName = do
63 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
65 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
66 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId hyperdataDocuments'
68 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
69 _ <- runCmd' $ add userCorpusId (map reId ids)
71 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
74 flowAnnuaire :: FilePath -> IO ()
75 flowAnnuaire filePath = do
76 contacts <- deserialiseImtUsersFromFile filePath
77 ps <- flowInsertAnnuaire "Annuaire" $ take 10 $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
78 printDebug "length annuaire" (ps)
82 flowInsertAnnuaire :: CorpusName
84 -> IO ([ReturnId], UserId, CorpusId, UserId, CorpusId)
85 flowInsertAnnuaire name children = do
87 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
88 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId children
90 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
91 _ <- runCmd' $ add userCorpusId (map reId ids)
93 printDebug "AnnuaireID" userCorpusId
95 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
101 flowCorpus :: NodeType
102 -> [HyperdataDocument]
103 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
105 flowCorpus NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
107 --------------------------------------------------
109 userListId <- runCmd' $ listFlowUser userId userCorpusId
110 printDebug "Working on User ListId : " userListId
112 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
113 -- printDebug "documentsWithId" documentsWithId
114 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
115 -- printDebug "docsWithNgrams" docsWithNgrams
116 let maps = mapNodeIdNgrams docsWithNgrams
118 -- printDebug "maps" (maps)
119 indexedNgrams <- runCmd' $ indexNgrams maps
120 -- printDebug "inserted ngrams" indexedNgrams
121 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
123 listId2 <- runCmd' $ listFlow masterUserId masterCorpusId indexedNgrams
124 printDebug "Working on ListId : " listId2
127 --------------------------------------------------
128 _ <- runCmd' $ mkDashboard userCorpusId userId
129 _ <- runCmd' $ mkGraph userCorpusId userId
132 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
135 -- runCmd' $ del [corpusId2, corpusId]
137 flowCorpus NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
138 flowCorpus _ _ _ = undefined
141 type CorpusName = Text
143 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
144 subFlowCorpus username cName = do
145 maybeUserId <- runCmd' (getUser username)
147 let userId = case maybeUserId of
148 Nothing -> panic "Error: User does not exist (yet)"
149 -- mk NodeUser gargantua_id "Node Gargantua"
150 Just user -> userLight_id user
152 rootId' <- map _node_id <$> runCmd' (getRoot userId)
154 rootId'' <- case rootId' of
155 [] -> runCmd' (mkRoot username userId)
156 n -> case length n >= 2 of
157 True -> panic "Error: more than 1 userNode / user"
158 False -> pure rootId'
159 let rootId = maybe (panic "error rootId") identity (head rootId'')
161 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
162 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
164 printDebug "(username, userId, rootId, corpusId)"
165 (username, userId, rootId, corpusId)
166 pure (userId, rootId, corpusId)
169 subFlowAnnuaire :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
170 subFlowAnnuaire username _cName = do
171 maybeUserId <- runCmd' (getUser username)
173 let userId = case maybeUserId of
174 Nothing -> panic "Error: User does not exist (yet)"
175 -- mk NodeUser gargantua_id "Node Gargantua"
176 Just user -> userLight_id user
178 rootId' <- map _node_id <$> runCmd' (getRoot userId)
180 rootId'' <- case rootId' of
181 [] -> runCmd' (mkRoot username userId)
182 n -> case length n >= 2 of
183 True -> panic "Error: more than 1 userNode / user"
184 False -> pure rootId'
185 let rootId = maybe (panic "error rootId") identity (head rootId'')
187 corpusId' <- runCmd' $ mkAnnuaire rootId userId
188 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
190 printDebug "(username, userId, rootId, corpusId)"
191 (username, userId, rootId, corpusId)
192 pure (userId, rootId, corpusId)
196 ------------------------------------------------------------------------
202 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
203 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
205 err = "Database.Flow.toInsert"
207 toInserted :: [ReturnId] -> Map HashId ReturnId
208 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
209 . filter (\r -> reInserted r == True)
211 data DocumentWithId =
212 DocumentWithId { documentId :: NodeId
213 , documentData :: HyperdataDocument
216 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
217 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
219 toDocumentWithId (hash,hpd) =
220 DocumentWithId <$> fmap reId (lookup hash rs)
223 ------------------------------------------------------------------------
225 data DocumentIdWithNgrams =
227 { documentWithId :: DocumentWithId
228 , document_ngrams :: Map (NgramsT Ngrams) Int
231 -- TODO add Terms (Title + Abstract)
232 -- add f :: Text -> Text
233 -- newtype Ngrams = Ngrams Text
234 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
235 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
236 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
237 <> [(NgramsT Authors a' , 1)| a' <- authors ]
239 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
240 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
241 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
247 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
248 -> [DocumentWithId] -> [DocumentIdWithNgrams]
249 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
251 -- | TODO check optimization
252 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
253 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
255 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
256 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
258 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
259 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
260 indexNgrams ng2nId = do
261 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
262 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
265 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
266 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
267 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
268 | (ng, nId2int) <- DM.toList m
269 , (nId, n) <- DM.toList nId2int
273 ------------------------------------------------------------------------
274 ------------------------------------------------------------------------
275 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
276 listFlow uId cId ngs = do
277 -- printDebug "ngs:" ngs
278 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
279 --printDebug "ngs" (DM.keys ngs)
280 -- TODO add stemming equivalence of 2 ngrams
281 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
282 _ <- insertGroups lId groupEd
284 -- compute Candidate / Map
285 let lists = ngrams2list ngs
286 -- printDebug "lists:" lists
288 is <- insertLists lId lists
289 printDebug "listNgrams inserted :" is
293 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
294 listFlowUser uId cId = mkList cId uId
296 ------------------------------------------------------------------------
298 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
299 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
300 -> Map NgramsIndexed NgramsIndexed
301 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
305 -- TODO check: do not insert duplicates
306 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
307 insertGroups lId ngrs =
308 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
309 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
313 ------------------------------------------------------------------------
314 -- TODO: verify NgramsT lost here
315 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
316 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
318 -- | TODO: weight of the list could be a probability
319 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
320 insertLists lId lngs =
321 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
322 | (l,ngr) <- map (second _ngramsId) lngs
325 ------------------------------------------------------------------------
326 ------------------------------------------------------------------------