2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Database.Flow (flowDatabase, ngrams2list)
19 import GHC.Show (Show)
20 import System.FilePath (FilePath)
21 import Data.Maybe (Maybe(..), catMaybes)
22 import Data.Text (Text, splitOn)
23 import Data.Map (Map, lookup)
24 import Data.Tuple.Extra (both, second)
25 import qualified Data.Map as DM
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard)--, mkAnnuaire)
32 import Gargantext.Database.Node.Document.Add (add)
33 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, ToDbData(..))
34 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
35 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 -- import Gargantext.Database.Node.Contact (HyperdataContact(..))
38 import Gargantext.Database.User (getUser, UserLight(..), Username)
39 import Gargantext.Ext.IMT (toSchoolName)
40 import Gargantext.Prelude
41 import Gargantext.Text.Parsers (parseDocs, FileFormat)
48 flowCorpus :: [ToDbData] -> CorpusName -> IO CorpusId
49 flowCorpus = undefined
52 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO Int
53 flowDatabase ff fp cName = do
55 (masterUserId, _, corpusId) <- subFlowCorpus userMaster corpusMasterName
58 hyperdataDocuments <- map addUniqIdsDoc <$> parseDocs ff fp
59 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
60 printDebug "hyperdataDocuments" (length hyperdataDocuments)
65 ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments'
66 -- printDebug "Docs IDs : " (ids)
67 -- idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments'
68 -- printDebug "Repeated Docs IDs : " (length idsRepeat)
69 let idsNotRepeated = filter (\r -> reInserted r == True) ids
72 -- todo: flow for new documents only
73 let tids = toInserted ids
74 printDebug "toInserted ids" (length tids)
76 let tihs = toInsert hyperdataDocuments
77 printDebug "toInsert hyperdataDocuments" (length tihs)
79 let documentsWithId = mergeData (toInserted idsNotRepeated) (toInsert hyperdataDocuments)
80 -- printDebug "documentsWithId" documentsWithId
82 -- docsWithNgrams <- documentIdWithNgrams documentsWithId extractNgramsT
83 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
84 -- printDebug "docsWithNgrams" docsWithNgrams
86 let maps = mapNodeIdNgrams docsWithNgrams
87 -- printDebug "maps" (maps)
89 indexedNgrams <- runCmd' $ indexNgrams maps
90 -- printDebug "inserted ngrams" indexedNgrams
91 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
94 listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
95 printDebug "list id : " listId2
98 (userId, _, corpusId2) <- subFlowCorpus userArbitrary cName
100 userListId <- runCmd' $ listFlowUser userId corpusId2
101 printDebug "UserList : " userListId
102 inserted <- runCmd' $ add corpusId2 (map reId ids)
103 printDebug "Added : " (length inserted)
105 _ <- runCmd' $ mkDashboard corpusId2 userId
106 _ <- runCmd' $ mkGraph corpusId2 userId
109 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
112 -- runCmd' $ del [corpusId2, corpusId]
114 type CorpusName = Text
116 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
117 subFlowCorpus username cName = do
118 maybeUserId <- runCmd' (getUser username)
120 let userId = case maybeUserId of
121 Nothing -> panic "Error: User does not exist (yet)"
122 -- mk NodeUser gargantua_id "Node Gargantua"
123 Just user -> userLight_id user
125 rootId' <- map _node_id <$> runCmd' (getRoot userId)
127 rootId'' <- case rootId' of
128 [] -> runCmd' (mkRoot username userId)
129 n -> case length n >= 2 of
130 True -> panic "Error: more than 1 userNode / user"
131 False -> pure rootId'
132 let rootId = maybe (panic "error rootId") identity (head rootId'')
134 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
135 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
137 printDebug "(username, userId, rootId, corpusId)"
138 (username, userId, rootId, corpusId)
139 pure (userId, rootId, corpusId)
141 ------------------------------------------------------------------------
147 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
148 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
150 hash = maybe "Error" identity
152 toInserted :: [ReturnId] -> Map HashId ReturnId
153 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
154 $ filter (\r -> reInserted r == True) rs
156 data DocumentWithId =
157 DocumentWithId { documentId :: NodeId
158 , documentData :: HyperdataDocument
161 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
162 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
164 toDocumentWithId (hash,hpd) =
165 DocumentWithId <$> fmap reId (lookup hash rs)
168 ------------------------------------------------------------------------
170 data DocumentIdWithNgrams =
172 { documentWithId :: DocumentWithId
173 , document_ngrams :: Map (NgramsT Ngrams) Int
176 -- TODO add Terms (Title + Abstract)
177 -- add f :: Text -> Text
178 -- newtype Ngrams = Ngrams Text
179 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
180 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
181 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
182 <> [(NgramsT Authors a' , 1)| a' <- authors ]
184 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
185 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
186 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
189 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
190 -> [DocumentWithId] -> [DocumentIdWithNgrams]
191 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
193 -- | TODO check optimization
194 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
195 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
197 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
198 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
200 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
201 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
202 indexNgrams ng2nId = do
203 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
204 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
207 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
208 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
209 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
210 | (ng, nId2int) <- DM.toList m
211 , (nId, n) <- DM.toList nId2int
215 ------------------------------------------------------------------------
216 ------------------------------------------------------------------------
217 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
218 listFlow uId cId ngs = do
219 -- printDebug "ngs:" ngs
220 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
221 --printDebug "ngs" (DM.keys ngs)
222 -- TODO add stemming equivalence of 2 ngrams
223 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
224 _ <- insertGroups lId groupEd
226 -- compute Candidate / Map
227 let lists = ngrams2list ngs
228 -- printDebug "lists:" lists
230 is <- insertLists lId lists
231 printDebug "listNgrams inserted :" is
235 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
236 listFlowUser uId cId = mkList cId uId
238 ------------------------------------------------------------------------
240 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
241 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
242 -> Map NgramsIndexed NgramsIndexed
243 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
247 -- TODO check: do not insert duplicates
248 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
249 insertGroups lId ngrs =
250 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
251 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
255 ------------------------------------------------------------------------
256 -- TODO: verify NgramsT lost here
257 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
258 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
260 -- | TODO: weight of the list could be a probability
261 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
262 insertLists lId lngs =
263 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
264 | (l,ngr) <- map (second _ngramsId) lngs
267 ------------------------------------------------------------------------
268 ------------------------------------------------------------------------