2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 Map (NgramsId, NodeId) -> insert
12 data NgramsType = Sources | Authors | Terms
13 nodes_ngrams : column type, column list
21 {-# LANGUAGE DeriveGeneric #-}
22 {-# LANGUAGE NoImplicitPrelude #-}
23 {-# LANGUAGE OverloadedStrings #-}
25 module Gargantext.Database.Flow (flowDatabase, ngrams2list)
28 import GHC.Show (Show)
29 import System.FilePath (FilePath)
30 import Data.Maybe (Maybe(..), catMaybes)
31 import Data.Text (Text, splitOn)
33 import Data.Tuple.Extra (both, second)
34 import qualified Data.Map as DM
36 import Gargantext.Core.Types (NodePoly(..), ListType(..), listId)
37 import Gargantext.Database.Bashql (runCmd', del)
38 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
39 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)
40 import Gargantext.Database.Node.Document.Add (add)
41 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
42 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
44 import Gargantext.Database.Types.Node (HyperdataDocument(..))
45 import Gargantext.Database.User (getUser, UserLight(..), Username)
46 import Gargantext.Prelude
47 import Gargantext.Text.Parsers (parseDocs, FileFormat)
48 import Gargantext.Ext.IMT (toSchoolName)
54 --flowDatabase :: FileFormat -> FilePath -> CorpusName -> Cmd Int
55 flowDatabase ff fp cName = do
58 (masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus"
61 hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
63 --printDebug "hyperdataDocuments" hyperdataDocuments
65 ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
66 --printDebug "Docs IDs : " (ids)
67 idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
68 --printDebug "Repeated Docs IDs : " (length ids)
71 -- todo: flow for new documents only
72 -- let tids = toInserted ids
73 --printDebug "toInserted ids" (length tids, tids)
75 -- let tihs = toInsert hyperdataDocuments
76 --printDebug "toInsert hyperdataDocuments" (length tihs, tihs)
78 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
79 -- printDebug "documentsWithId" documentsWithId
81 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
82 printDebug "docsWithNgrams" docsWithNgrams
84 let maps = mapNodeIdNgrams docsWithNgrams
85 printDebug "maps" (maps)
87 indexedNgrams <- runCmd' $ indexNgrams maps
88 printDebug "inserted ngrams" indexedNgrams
89 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
92 listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
93 printDebug "list id : " listId2
95 (userId, _, corpusId2) <- subFlow "user1" cName
97 userListId <- runCmd' $ listFlowUser userId corpusId2
98 printDebug "UserList : " userListId
100 inserted <- runCmd' $ add corpusId2 (map reId ids)
101 printDebug "Inserted : " (length inserted)
104 -- runCmd' $ del [corpusId2, corpusId]
106 type CorpusName = Text
108 subFlow :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
109 subFlow username cName = do
110 maybeUserId <- runCmd' (getUser username)
112 let userId = case maybeUserId of
113 Nothing -> panic "Error: User does not exist (yet)"
114 -- mk NodeUser gargantua_id "Node Gargantua"
115 Just user -> userLight_id user
117 rootId' <- map _node_id <$> runCmd' (getRoot userId)
119 rootId'' <- case rootId' of
120 [] -> runCmd' (mkRoot username userId)
121 n -> case length n >= 2 of
122 True -> panic "Error: more than 1 userNode / user"
123 False -> pure rootId'
124 let rootId = maybe (panic "error rootId") identity (head rootId'')
126 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
127 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
129 printDebug "(username, userId, rootId, corpusId)"
130 (username, userId, rootId, corpusId)
131 pure (userId, rootId, corpusId)
133 ------------------------------------------------------------------------
139 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
140 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
142 hash = maybe "Error" identity
144 toInserted :: [ReturnId] -> Map HashId ReturnId
145 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
146 $ filter (\r -> reInserted r == True) rs
148 data DocumentWithId =
149 DocumentWithId { documentId :: NodeId
150 , documentData :: HyperdataDocument
153 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
154 mergeData rs hs = map (\(hash,hpd) -> DocumentWithId (lookup' hash rs) hpd) $ DM.toList hs
156 lookup' h xs = maybe (panic $ message <> h) reId (DM.lookup h xs)
157 message = "Database.Flow.mergeData: Error with "
159 ------------------------------------------------------------------------
161 data DocumentIdWithNgrams =
163 { documentWithId :: DocumentWithId
164 , document_ngrams :: Map (NgramsT Ngrams) Int
167 -- TODO add Terms (Title + Abstract)
168 -- add f :: Text -> Text
169 -- newtype Ngrams = Ngrams Text
170 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
171 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
172 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
173 <> [(NgramsT Authors a' , 1)| a' <- authors ]
175 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
176 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
177 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
180 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
181 -> [DocumentWithId] -> [DocumentIdWithNgrams]
182 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
184 -- | TODO check optimization
185 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
186 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
188 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
189 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
191 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
192 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
193 indexNgrams ng2nId = do
194 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
195 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
198 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
199 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
200 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
201 | (ng, nId2int) <- DM.toList m
202 , (nId, n) <- DM.toList nId2int
206 ------------------------------------------------------------------------
207 ------------------------------------------------------------------------
208 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
209 listFlow uId cId ngs = do
210 printDebug "ngs:" ngs
211 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
212 printDebug "ngs" (DM.keys ngs)
213 -- TODO add stemming equivalence of 2 ngrams
214 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
215 _ <- insertGroups lId groupEd
217 -- compute Candidate / Map
218 let lists = ngrams2list ngs
219 printDebug "lists:" lists
221 is <- insertLists lId lists
222 printDebug "listNgrams inserted :" is
226 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
227 listFlowUser uId cId = mkList cId uId
229 ------------------------------------------------------------------------
231 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
232 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
233 -> Map NgramsIndexed NgramsIndexed
234 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
238 -- TODO check: do not insert duplicates
239 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
240 insertGroups lId ngrs =
241 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
242 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
246 ------------------------------------------------------------------------
247 -- TODO: verify NgramsT lost here
248 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
249 ngrams2list = zip (repeat Candidate) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
251 -- | TODO: weight of the list could be a probability
252 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
253 insertLists lId lngs =
254 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listId l) (listId l)
255 | (l,ngr) <- map (second _ngramsId) lngs
258 ------------------------------------------------------------------------
259 ------------------------------------------------------------------------