2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
19 import GHC.Show (Show)
20 import System.FilePath (FilePath)
21 import Data.Maybe (Maybe(..), catMaybes)
22 import Data.Text (Text, splitOn)
23 import Data.Map (Map, lookup)
24 import Data.Tuple.Extra (both, second)
25 import qualified Data.Map as DM
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard)--, mkAnnuaire)
32 import Gargantext.Database.Types.Node (NodeType(..))
33 import Gargantext.Database.Node.Document.Add (add)
34 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
35 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
36 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
37 import Gargantext.Database.Types.Node (HyperdataDocument(..))
38 import Gargantext.Database.Node.Contact (HyperdataContact(..))
39 import Gargantext.Database.User (getUser, UserLight(..), Username)
40 import Gargantext.Ext.IMT (toSchoolName)
41 import Gargantext.Prelude
42 import Gargantext.Text.Parsers (parseDocs, FileFormat)
45 type MasterUserId = Int
51 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
52 flowDatabase ff fp cName = do
54 hyperdataDocuments <- map addUniqIdsDoc <$> parseDocs ff fp
55 params <- flowInsert NodeCorpus hyperdataDocuments cName
56 flowCorpus NodeCorpus hyperdataDocuments params
59 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
60 -> IO ([ReturnId], MasterUserId, UserId, CorpusId)
61 flowInsert nt hyperdataDocuments cName = do
62 let hyperdataDocuments' = case nt of
63 NodeCorpus -> map (\h -> ToDbDocument h) hyperdataDocuments
64 -- NodeAnnuaire -> map (\h -> ToDbContact h) hyperdataDocuments
66 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
67 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId hyperdataDocuments'
69 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
70 _ <- runCmd' $ add userCorpusId (map reId ids)
72 pure (ids, masterUserId, userId, userCorpusId)
75 flowInsertAnnuaire name children = do
77 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
78 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId children
80 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary name
81 _ <- runCmd' $ add userCorpusId (map reId ids)
83 printDebug "AnnuaireID" userCorpusId
85 pure (ids, masterUserId, userId, userCorpusId)
91 -- flowCorpus :: NodeType -> [HyperdataDocument] -> ([ReturnId],MasterUserId,UserId,CorpusId) -> IO CorpusId
92 flowCorpus NodeCorpus hyperdataDocuments (ids,masterUserId,userId,userCorpusId) = do
94 --------------------------------------------------
96 userListId <- runCmd' $ listFlowUser userId userCorpusId
97 printDebug "Working on User ListId : " userListId
99 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
100 -- printDebug "documentsWithId" documentsWithId
101 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
102 -- printDebug "docsWithNgrams" docsWithNgrams
103 let maps = mapNodeIdNgrams docsWithNgrams
105 -- printDebug "maps" (maps)
106 indexedNgrams <- runCmd' $ indexNgrams maps
107 -- printDebug "inserted ngrams" indexedNgrams
108 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
110 listId2 <- runCmd' $ listFlow masterUserId userCorpusId indexedNgrams
111 printDebug "Working on ListId : " listId2
114 --------------------------------------------------
115 _ <- runCmd' $ mkDashboard userCorpusId userId
116 _ <- runCmd' $ mkGraph userCorpusId userId
119 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
122 -- runCmd' $ del [corpusId2, corpusId]
124 flowCorpus NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_userId,_userCorpusId) = undefined
125 flowCorpus _ _ _ = undefined
128 type CorpusName = Text
130 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
131 subFlowCorpus username cName = do
132 maybeUserId <- runCmd' (getUser username)
134 let userId = case maybeUserId of
135 Nothing -> panic "Error: User does not exist (yet)"
136 -- mk NodeUser gargantua_id "Node Gargantua"
137 Just user -> userLight_id user
139 rootId' <- map _node_id <$> runCmd' (getRoot userId)
141 rootId'' <- case rootId' of
142 [] -> runCmd' (mkRoot username userId)
143 n -> case length n >= 2 of
144 True -> panic "Error: more than 1 userNode / user"
145 False -> pure rootId'
146 let rootId = maybe (panic "error rootId") identity (head rootId'')
148 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
149 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
151 printDebug "(username, userId, rootId, corpusId)"
152 (username, userId, rootId, corpusId)
153 pure (userId, rootId, corpusId)
155 ------------------------------------------------------------------------
161 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
162 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
164 err = "Database.Flow.toInsert"
166 toInserted :: [ReturnId] -> Map HashId ReturnId
167 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
168 . filter (\r -> reInserted r == True)
170 data DocumentWithId =
171 DocumentWithId { documentId :: NodeId
172 , documentData :: HyperdataDocument
175 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
176 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
178 toDocumentWithId (hash,hpd) =
179 DocumentWithId <$> fmap reId (lookup hash rs)
182 ------------------------------------------------------------------------
184 data DocumentIdWithNgrams =
186 { documentWithId :: DocumentWithId
187 , document_ngrams :: Map (NgramsT Ngrams) Int
190 -- TODO add Terms (Title + Abstract)
191 -- add f :: Text -> Text
192 -- newtype Ngrams = Ngrams Text
193 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
194 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
195 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
196 <> [(NgramsT Authors a' , 1)| a' <- authors ]
198 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
199 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
200 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
206 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
207 -> [DocumentWithId] -> [DocumentIdWithNgrams]
208 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
210 -- | TODO check optimization
211 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
212 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
214 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
215 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
217 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
218 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
219 indexNgrams ng2nId = do
220 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
221 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
224 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
225 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
226 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
227 | (ng, nId2int) <- DM.toList m
228 , (nId, n) <- DM.toList nId2int
232 ------------------------------------------------------------------------
233 ------------------------------------------------------------------------
234 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
235 listFlow uId cId ngs = do
236 -- printDebug "ngs:" ngs
237 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
238 --printDebug "ngs" (DM.keys ngs)
239 -- TODO add stemming equivalence of 2 ngrams
240 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
241 _ <- insertGroups lId groupEd
243 -- compute Candidate / Map
244 let lists = ngrams2list ngs
245 -- printDebug "lists:" lists
247 is <- insertLists lId lists
248 printDebug "listNgrams inserted :" is
252 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
253 listFlowUser uId cId = mkList cId uId
255 ------------------------------------------------------------------------
257 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
258 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
259 -> Map NgramsIndexed NgramsIndexed
260 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
264 -- TODO check: do not insert duplicates
265 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
266 insertGroups lId ngrs =
267 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
268 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
272 ------------------------------------------------------------------------
273 -- TODO: verify NgramsT lost here
274 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
275 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
277 -- | TODO: weight of the list could be a probability
278 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
279 insertLists lId lngs =
280 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
281 | (l,ngr) <- map (second _ngramsId) lngs
284 ------------------------------------------------------------------------
285 ------------------------------------------------------------------------