2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 Map (NgramsId, NodeId) -> insert
12 data NgramsType = Sources | Authors | Terms
13 nodes_ngrams : column type, column list
21 {-# LANGUAGE DeriveGeneric #-}
22 {-# LANGUAGE NoImplicitPrelude #-}
23 {-# LANGUAGE OverloadedStrings #-}
25 module Gargantext.Database.Flow
27 import System.FilePath (FilePath)
28 import Data.Maybe (Maybe(..))
29 import Data.Text (Text, unpack)
31 import qualified Data.Map as DM
32 import GHC.Generics (Generic)
34 import Gargantext.Core.Types (NodePoly(..))
35 import Gargantext.Prelude
36 import Gargantext.Database.Bashql (runCmd', del)
37 import Gargantext.Database.Types.Node (Node(..), HyperdataDocument(..))
38 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..))
39 import Gargantext.Database.User (getUser, UserLight(..), Username)
40 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
41 import Gargantext.Database.Node.Document.Add (add)
42 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
44 import Gargantext.Database.Ngram (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId)
50 subFlow :: Username -> IO (UserId, RootId, CorpusId)
52 maybeUserId <- runCmd' (getUser username)
54 let userId = case maybeUserId of
55 Nothing -> panic "Error: User does not exist (yet)"
56 -- mk NodeUser gargantua_id "Node Gargantua"
57 Just user -> userLight_id user
59 rootId' <- map _node_id <$> runCmd' (getRoot userId)
61 rootId'' <- case rootId' of
62 [] -> runCmd' (mkRoot userId)
63 un -> case length un >= 2 of
64 True -> panic "Error: more than 1 userNode / user"
66 let rootId = maybe (panic "error rootId") identity (head rootId'')
68 corpusId' <- runCmd' $ mkCorpus (Just "Corpus WOS") Nothing rootId userId
69 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
71 printDebug "(username, userId, rootId, corpusId"
72 (username, userId, rootId, corpusId)
73 pure (userId, rootId, corpusId)
76 flow :: FilePath -> IO Int
79 (masterUserId, _, corpusId) <- subFlow "gargantua"
81 docs <- map addUniqIds <$> parseDocs WOS fp
82 ids <- runCmd' $ insertDocuments masterUserId corpusId docs
83 printDebug "Docs IDs : " ids
85 idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId docs
86 printDebug "Docs IDs : " idsRepeat
88 (_, _, corpusId2) <- subFlow "alexandre"
90 inserted <- runCmd' $ add corpusId2 (map reId ids)
91 printDebug "Inserted : " inserted
93 runCmd' $ del [corpusId2, corpusId]
95 ----------------------------------------------------------------
98 type ToInsert = Map HashId HyperdataDocument
99 type Inserted = Map HashId ReturnId
101 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
102 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d))
104 hash = maybe "Error" identity
106 toInserted :: [ReturnId] -> Map HashId ReturnId
107 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
108 $ filter (\r -> reInserted r == True) rs
110 data DocumentWithId = DocumentWithId { documentId :: NodeId
111 , documentData :: HyperdataDocument
115 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
116 mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs
118 lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs)
120 data DocumentIdWithNgrams = DocumentIdWithNgrams { documentWithId :: DocumentWithId
121 , document_ngrams :: Map (NgramsT Ngrams)Int
125 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
126 -> [DocumentWithId] -> [DocumentIdWithNgrams]
127 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
129 -- | TODO check optimization
130 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
131 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
133 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
134 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
136 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
137 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
138 indexNgrams ng2nId = do
139 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
140 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
143 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
144 insertToNodeNgrams m = insertNodeNgrams $ [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
145 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
147 | (ng, nId2int) <- DM.toList m
148 , (nId, n) <- DM.toList nId2int
153 -- insertInto NodeNgramsNgrams
155 -- compute Candidate / Map
156 -- ALTER TABLE nodes_nodes_ngrams ADD COLUMN typelist int;
157 -- insertNodeNodeNgram
159 -- get data of NgramsTable
160 -- post :: update NodeNodeNgrams