{-|
Module      : Gargantext.Database.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

Design notes:
  Map (NgramsId, NodeId) -> insert
  data NgramsType = Sources | Authors | Terms
  nodes_ngrams : column type, column list

-}

{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}

module Gargantext.Database.Flow (flowDatabase)
  where

import System.FilePath (FilePath)
import Data.Maybe (Maybe(..), catMaybes)
import Data.Text (Text)
import Data.Map (Map)
import Data.Tuple.Extra (both, second)
import qualified Data.Map as DM

import Gargantext.Core.Types (NodePoly(..), ListType(..), listId)
import Gargantext.Database.Bashql (runCmd') --, del)
import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)
import Gargantext.Database.Node.Document.Add (add)
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Database.User (getUser, UserLight(..), Username)
import Gargantext.Prelude
import Gargantext.Text.Parsers (parseDocs, FileFormat)

flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO [Int]
flowDatabase ff fp cName = do

  -- Master user and master corpus
  (masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus"

  -- Documents Flow
  hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
  ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
  printDebug "Docs IDs : " (length ids)
  idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
  printDebug "Repeated Docs IDs : " (length idsRepeat)

  -- Ngrams Flow
  let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
  let docsWithNgrams  = documentIdWithNgrams extractNgramsT documentsWithId
  let maps            = mapNodeIdNgrams docsWithNgrams
  indexedNgrams <- runCmd' $ indexNgrams maps
  _ <- runCmd' $ insertToNodeNgrams indexedNgrams

  -- List Flow
  listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
  printDebug "list id : " listId2

  -- User flow: attach the same documents to the user's corpus
  (_, _, corpusId2) <- subFlow "alexandre" cName
  inserted <- runCmd' $ add corpusId2 (map reId ids)
  printDebug "Inserted : " (length inserted)

  pure [corpusId2, corpusId]
  --runCmd' $ del [corpusId2, corpusId]
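
-- A minimal usage sketch (the 'runFlow' wrapper, format value and path are
-- hypothetical; any 'FileFormat' accepted by 'parseDocs' works):
--
-- > runFlow :: FileFormat -> IO ()
-- > runFlow format = do
-- >   corpusIds <- flowDatabase format "/path/to/docs" "My Corpus"
-- >   printDebug "corpus ids : " corpusIds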

type CorpusName = Text

subFlow :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
subFlow username cName = do
  maybeUserId <- runCmd' (getUser username)

  let userId = case maybeUserId of
        Nothing   -> panic "Error: User does not exist (yet)"
        -- mk NodeUser gargantua_id "Node Gargantua"
        Just user -> userLight_id user

  rootId' <- map _node_id <$> runCmd' (getRoot userId)

  rootId'' <- case rootId' of
        [] -> runCmd' (mkRoot userId)
        n  -> if length n >= 2
                then panic "Error: more than 1 userNode / user"
                else pure rootId'

  let rootId = maybe (panic "error rootId") identity (head rootId'')

  corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
  let corpusId = maybe (panic "error corpusId") identity (head corpusId')

  printDebug "(username, userId, rootId, corpusId)"
             (username, userId, rootId, corpusId)
  pure (userId, rootId, corpusId)

------------------------------------------------------------------------
-- Id aliases used in the signatures below. These are assumed to be plain
-- database integers; HashId is the Text digest stored in
-- '_hyperdataDocument_uniqIdBdd' by 'addUniqIds'.
type UserId   = Int
type RootId   = Int
type CorpusId = Int
type ListId   = Int
type NodeId   = Int
type HashId   = Text

toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d))
  where
    hash = maybe "Error" identity

toInserted :: [ReturnId] -> Map HashId ReturnId
toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r))
                            $ filter reInserted rs

data DocumentWithId =
     DocumentWithId { documentId   :: NodeId
                    , documentData :: HyperdataDocument
                    }

mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs
  where
    lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs)
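
-- A shape-only sketch of the join above (hashes and ids made up):
--
-- >   toInserted ids  ~ fromList [("h1", r1)]            -- r1 has reId 401
-- >   toInsert   docs ~ fromList [("h1", d1), ("h2", d2)]
-- >   mergeData ..    ~ [DocumentWithId 401 d1]
--
-- Documents whose hash is absent from the inserted map are dropped; an
-- inserted hash with no matching document panics via 'lookup''.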

------------------------------------------------------------------------

data DocumentIdWithNgrams =
     DocumentIdWithNgrams
     { documentWithId  :: DocumentWithId
     , document_ngrams :: Map (NgramsT Ngrams) Int
     }

-- TODO add Authors and Terms (Title + Abstract)
-- add f :: Text -> Text
-- newtype Ngrams = Ngrams Text
extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
extractNgramsT doc = DM.fromList [(NgramsT Sources ngrams, 1)]
  where
    ngrams      = text2ngrams $ maybe "Nothing" identity maybeNgrams
    maybeNgrams = _hyperdataDocument_source doc
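
-- Illustrative behaviour (values made up): a document whose source field is
-- Just "Nature" yields
--
-- >   DM.fromList [(NgramsT Sources (text2ngrams "Nature"), 1)]
--
-- and a document with no source falls back to the literal text "Nothing".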

documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
                     -> [DocumentWithId] -> [DocumentIdWithNgrams]
documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))

-- | TODO check optimization
mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
  where
    xs  = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
    n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
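
-- Shape-only example of the inversion (node ids and counts made up):
--
-- >   input  ~ [ (1, {"ab" -> 2}), (2, {"ab" -> 1, "cd" -> 3}) ]
-- >   output ~ { "ab" -> {1 -> 2, 2 -> 1}, "cd" -> {2 -> 3} }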

indexNgrams :: Map (NgramsT Ngrams) (Map NodeId Int)
            -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
indexNgrams ng2nId = do
  terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
  pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
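
-- 'insertNgrams' persists each distinct ngram and returns its database id;
-- 'DM.mapKeys' then upgrades every key from a plain 'Ngrams' to an
-- 'NgramsIndexed' carrying that id, leaving the per-node counts untouched.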

insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT) ng)
                                                    (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
                                        | (ng, nId2int) <- DM.toList m
                                        , (nId, n)      <- DM.toList nId2int
                                        ]

------------------------------------------------------------------------
------------------------------------------------------------------------
listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
listFlow uId cId ng = do
  lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId

  -- TODO add stemming equivalence of 2 ngrams
  let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2)
                                 -> if t1 == t2 && n1 == n2
                                      then Just (n1, n2)
                                      else Nothing) ng
  _ <- insertGroups lId groupEd

  -- compute Candidate / Map
  let lists = ngrams2list ng
  _ <- insertLists lId lists

  pure lId

------------------------------------------------------------------------
groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
              -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
              -> Map NgramsIndexed NgramsIndexed
groupNgramsBy isEqual ngs = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys ngs, n2 <- DM.keys ngs ]
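
-- Note: this compares every pair of keys, so it is quadratic in the number
-- of distinct ngrams; with the strict-equality predicate used in 'listFlow',
-- each ngram is simply paired with itself.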

-- TODO check: do not insert duplicates
insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
insertGroups lId ngrs =
  insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
                            | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
                            ]

------------------------------------------------------------------------
-- TODO: verify NgramsT lost here
-- Note: this must return an association list, not a Map keyed by ListType:
-- every entry is tagged Candidate, so a Map would collapse them all into a
-- single ngram.
ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType, NgramsIndexed)]
ngrams2list = zip (repeat Candidate) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys

-- | TODO: weight of the list could be a probability
insertLists :: ListId -> [(ListType, NgramsIndexed)] -> Cmd Int
insertLists lId list2ngrams =
  insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listId l) (listId l)
                   | (l, ngr) <- map (second _ngramsId) list2ngrams
                   ]
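
-- Note: the weight column currently just repeats the ListType id (see the
-- TODO above); a genuine weight or probability would replace
-- 'fromIntegral $ listId l'.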

------------------------------------------------------------------------
------------------------------------------------------------------------