2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
19 import GHC.Show (Show)
20 import System.FilePath (FilePath)
21 import Data.Maybe (Maybe(..), catMaybes)
22 import Data.Text (Text, splitOn)
23 import Data.Map (Map, lookup)
24 import Data.Tuple.Extra (both, second)
25 import qualified Data.Map as DM
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard)--, mkAnnuaire)
32 import Gargantext.Database.Types.Node (NodeType(..))
33 import Gargantext.Database.Node.Document.Add (add)
34 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
35 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
36 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
37 import Gargantext.Database.Types.Node (HyperdataDocument(..))
38 import Gargantext.Database.Node.Contact (HyperdataContact(..))
39 import Gargantext.Database.User (getUser, UserLight(..), Username)
40 import Gargantext.Ext.IMT (toSchoolName)
41 import Gargantext.Prelude
42 import Gargantext.Text.Parsers (parseDocs, FileFormat)
45 type MasterUserId = Int
51 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
52 flowDatabase ff fp cName = do
54 hyperdataDocuments <- map addUniqIdsDoc <$> parseDocs ff fp
55 params <- flowInsert NodeCorpus hyperdataDocuments cName
56 flowCorpus NodeCorpus hyperdataDocuments params
59 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
60 -> IO ([ReturnId], MasterUserId, UserId, CorpusId)
61 flowInsert nt hyperdataDocuments cName = do
62 let hyperdataDocuments' = case nt of
63 NodeCorpus -> map (\h -> ToDbDocument h) hyperdataDocuments
64 -- NodeAnnuaire -> map (\h -> ToDbContact h) hyperdataDocuments
66 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
67 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId hyperdataDocuments'
69 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
70 _ <- runCmd' $ add userCorpusId (map reId ids)
72 pure (ids, masterUserId, userId, userCorpusId)
75 flowInsertAnnuaire children name = do
77 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
78 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId children
80 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary name
81 _ <- runCmd' $ add userCorpusId (map reId ids)
83 pure (ids, masterUserId, userId, userCorpusId)
89 -- flowCorpus :: NodeType -> [HyperdataDocument] -> ([ReturnId],MasterUserId,UserId,CorpusId) -> IO CorpusId
90 flowCorpus NodeCorpus hyperdataDocuments (ids,masterUserId,userId,userCorpusId) = do
92 --------------------------------------------------
94 userListId <- runCmd' $ listFlowUser userId userCorpusId
95 printDebug "Working on User ListId : " userListId
97 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
98 -- printDebug "documentsWithId" documentsWithId
99 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
100 -- printDebug "docsWithNgrams" docsWithNgrams
101 let maps = mapNodeIdNgrams docsWithNgrams
103 -- printDebug "maps" (maps)
104 indexedNgrams <- runCmd' $ indexNgrams maps
105 -- printDebug "inserted ngrams" indexedNgrams
106 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
108 listId2 <- runCmd' $ listFlow masterUserId userCorpusId indexedNgrams
109 printDebug "Working on ListId : " listId2
112 --------------------------------------------------
113 _ <- runCmd' $ mkDashboard userCorpusId userId
114 _ <- runCmd' $ mkGraph userCorpusId userId
117 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
120 -- runCmd' $ del [corpusId2, corpusId]
122 flowCorpus NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_userId,_userCorpusId) = undefined
123 flowCorpus _ _ _ = undefined
125 type CorpusName = Text
127 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
128 subFlowCorpus username cName = do
129 maybeUserId <- runCmd' (getUser username)
131 let userId = case maybeUserId of
132 Nothing -> panic "Error: User does not exist (yet)"
133 -- mk NodeUser gargantua_id "Node Gargantua"
134 Just user -> userLight_id user
136 rootId' <- map _node_id <$> runCmd' (getRoot userId)
138 rootId'' <- case rootId' of
139 [] -> runCmd' (mkRoot username userId)
140 n -> case length n >= 2 of
141 True -> panic "Error: more than 1 userNode / user"
142 False -> pure rootId'
143 let rootId = maybe (panic "error rootId") identity (head rootId'')
145 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
146 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
148 printDebug "(username, userId, rootId, corpusId)"
149 (username, userId, rootId, corpusId)
150 pure (userId, rootId, corpusId)
152 ------------------------------------------------------------------------
158 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
159 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
161 err = "Database.Flow.toInsert"
163 toInserted :: [ReturnId] -> Map HashId ReturnId
164 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
165 . filter (\r -> reInserted r == True)
167 data DocumentWithId =
168 DocumentWithId { documentId :: NodeId
169 , documentData :: HyperdataDocument
172 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
173 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
175 toDocumentWithId (hash,hpd) =
176 DocumentWithId <$> fmap reId (lookup hash rs)
179 ------------------------------------------------------------------------
181 data DocumentIdWithNgrams =
183 { documentWithId :: DocumentWithId
184 , document_ngrams :: Map (NgramsT Ngrams) Int
187 -- TODO add Terms (Title + Abstract)
188 -- add f :: Text -> Text
189 -- newtype Ngrams = Ngrams Text
190 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
191 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
192 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
193 <> [(NgramsT Authors a' , 1)| a' <- authors ]
195 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
196 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
197 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
203 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
204 -> [DocumentWithId] -> [DocumentIdWithNgrams]
205 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
207 -- | TODO check optimization
208 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
209 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
211 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
212 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
214 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
215 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
216 indexNgrams ng2nId = do
217 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
218 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
221 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
222 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
223 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
224 | (ng, nId2int) <- DM.toList m
225 , (nId, n) <- DM.toList nId2int
229 ------------------------------------------------------------------------
230 ------------------------------------------------------------------------
231 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
232 listFlow uId cId ngs = do
233 -- printDebug "ngs:" ngs
234 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
235 --printDebug "ngs" (DM.keys ngs)
236 -- TODO add stemming equivalence of 2 ngrams
237 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
238 _ <- insertGroups lId groupEd
240 -- compute Candidate / Map
241 let lists = ngrams2list ngs
242 -- printDebug "lists:" lists
244 is <- insertLists lId lists
245 printDebug "listNgrams inserted :" is
249 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
250 listFlowUser uId cId = mkList cId uId
252 ------------------------------------------------------------------------
254 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
255 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
256 -> Map NgramsIndexed NgramsIndexed
257 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
261 -- TODO check: do not insert duplicates
262 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
263 insertGroups lId ngrs =
264 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
265 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
269 ------------------------------------------------------------------------
270 -- TODO: verify NgramsT lost here
271 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
272 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
274 -- | TODO: weight of the list could be a probability
275 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
276 insertLists lId lngs =
277 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
278 | (l,ngr) <- map (second _ngramsId) lngs
281 ------------------------------------------------------------------------
282 ------------------------------------------------------------------------