]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[FLOW][DB][NGRAMS] group inserted in db.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 Map (NgramsId, NodeId) -> insert
12 data NgramsType = Sources | Authors | Terms
13 nodes_ngrams : column type, column list
14
15 documents
16 sources
17 authors
18
19 -}
20
21 {-# LANGUAGE DeriveGeneric #-}
22 {-# LANGUAGE NoImplicitPrelude #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Database.Flow
26 where
27 import System.FilePath (FilePath)
28 import Data.Maybe (Maybe(..))
29 import Data.Text (Text)
30 import Data.Map (Map)
31 import Data.Tuple.Extra (both)
32 import qualified Data.Map as DM
33
34 import Gargantext.Core.Types (NodePoly(..))
35 import Gargantext.Prelude
36 import Gargantext.Database.Bashql (runCmd', del)
37 import Gargantext.Database.Types.Node (HyperdataDocument(..))
38 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)
39 import Gargantext.Database.User (getUser, UserLight(..), Username)
40 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
41 import Gargantext.Database.Node.Document.Add (add)
42 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Database.NodeNgramNgram (NodeNgramNgramPoly(..), insertNodeNgramNgram)
44
45 import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
46 import Gargantext.Database.Ngram (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId)
47
48 type UserId = Int
49 type RootId = Int
50 type CorpusId = Int
51
52 flow :: FilePath -> IO Int
53 flow fp = do
54
55 (masterUserId, _, corpusId) <- subFlow "gargantua"
56
57 docs <- map addUniqIds <$> parseDocs WOS fp
58 ids <- runCmd' $ insertDocuments masterUserId corpusId docs
59 printDebug "Docs IDs : " ids
60
61 idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId docs
62 printDebug "Docs IDs : " idsRepeat
63
64 (_, _, corpusId2) <- subFlow "alexandre"
65
66 inserted <- runCmd' $ add corpusId2 (map reId ids)
67 printDebug "Inserted : " inserted
68
69 runCmd' $ del [corpusId2, corpusId]
70
71
72 subFlow :: Username -> IO (UserId, RootId, CorpusId)
73 subFlow username = do
74 maybeUserId <- runCmd' (getUser username)
75
76 let userId = case maybeUserId of
77 Nothing -> panic "Error: User does not exist (yet)"
78 -- mk NodeUser gargantua_id "Node Gargantua"
79 Just user -> userLight_id user
80
81 rootId' <- map _node_id <$> runCmd' (getRoot userId)
82
83 rootId'' <- case rootId' of
84 [] -> runCmd' (mkRoot userId)
85 un -> case length un >= 2 of
86 True -> panic "Error: more than 1 userNode / user"
87 False -> pure rootId'
88 let rootId = maybe (panic "error rootId") identity (head rootId'')
89
90 corpusId' <- runCmd' $ mkCorpus (Just "Corpus WOS") Nothing rootId userId
91 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
92
93 printDebug "(username, userId, rootId, corpusId"
94 (username, userId, rootId, corpusId)
95 pure (userId, rootId, corpusId)
96
97 ----------------------------------------------------------------
98 type HashId = Text
99 type NodeId = Int
100 type ListId = Int
101 type ToInsert = Map HashId HyperdataDocument
102 type Inserted = Map HashId ReturnId
103
104 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
105 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d))
106 where
107 hash = maybe "Error" identity
108
109 toInserted :: [ReturnId] -> Map HashId ReturnId
110 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
111 $ filter (\r -> reInserted r == True) rs
112
113 data DocumentWithId =
114 DocumentWithId
115 { documentId :: NodeId
116 , documentData :: HyperdataDocument
117 }
118
119
120 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
121 mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs
122 where
123 lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs)
124
125 data DocumentIdWithNgrams =
126 DocumentIdWithNgrams
127 { documentWithId :: DocumentWithId
128 , document_ngrams :: Map (NgramsT Ngrams) Int
129 }
130
131 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
132 -> [DocumentWithId] -> [DocumentIdWithNgrams]
133 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
134
135 -- | TODO check optimization
136 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
137 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
138 where
139 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
140 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
141
142 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
143 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
144 indexNgrams ng2nId = do
145 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
146 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
147
148
149 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
150 insertToNodeNgrams m = insertNodeNgrams $ [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
151 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
152
153 | (ng, nId2int) <- DM.toList m
154 , (nId, n) <- DM.toList nId2int
155 ]
156
157 ------------------------------------------------------------------------
158 groupNgramsBy :: (Ngrams -> Ngrams -> Bool) -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Map NgramsIndexed NgramsIndexed
159 groupNgramsBy = undefined
160
161 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
162 insertGroups lId ngrs =
163 insertNodeNgramNgram $ [ NodeNgramNgram lId ng1 ng2 (Just 1)
164 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
165 ]
166
167 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
168 listFlow uId cId ng = do
169 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
170 -- TODO add stemming equivalence of 2 ngrams
171 let groupEd = groupNgramsBy (==) ng
172
173 _ <- insertGroups lId groupEd
174
175 -- compute Candidate / Map
176 -- ALTER TABLE nodes_nodes_ngrams ADD COLUMN typelist int;
177 -- insertLists = NodeNodeNgram
178
179 pure lId
180
181
182 -- | TODO ask on meeting
183 -- get data of NgramsTable
184 -- post :: update NodeNodeNgrams
185 -- group ngrams
186