]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[FLOW][DB][NGRAMS][Lists] Done, need to be tested with real data.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 Map (NgramsId, NodeId) -> insert
12 data NgramsType = Sources | Authors | Terms
13 nodes_ngrams : column type, column list
14
15 documents
16 sources
17 authors
18
19 -}
20
21 {-# LANGUAGE DeriveGeneric #-}
22 {-# LANGUAGE NoImplicitPrelude #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Database.Flow
26 where
27 import System.FilePath (FilePath)
28 import Data.Maybe (Maybe(..))
29 import Data.Text (Text)
30 import Data.Map (Map)
31 import Data.Tuple.Extra (both, second)
32 import qualified Data.Map as DM
33
34 import Gargantext.Core.Types (NodePoly(..), ListType(..), listId)
35 import Gargantext.Prelude
36 import Gargantext.Database.Bashql (runCmd', del)
37 import Gargantext.Database.Types.Node (HyperdataDocument(..))
38 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)
39 import Gargantext.Database.User (getUser, UserLight(..), Username)
40 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
41 import Gargantext.Database.Node.Document.Add (add)
42 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Database.NodeNgramNgram (NodeNgramNgramPoly(..), insertNodeNgramNgram)
44
45 import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
46 import Gargantext.Database.Ngram (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId)
47
48 type UserId = Int
49 type RootId = Int
50 type CorpusId = Int
51
52 flow :: FilePath -> IO Int
53 flow fp = do
54
55 (masterUserId, _, corpusId) <- subFlow "gargantua"
56
57 docs <- map addUniqIds <$> parseDocs WOS fp
58 ids <- runCmd' $ insertDocuments masterUserId corpusId docs
59 printDebug "Docs IDs : " ids
60
61 idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId docs
62 printDebug "Docs IDs : " idsRepeat
63
64 (_, _, corpusId2) <- subFlow "alexandre"
65
66 inserted <- runCmd' $ add corpusId2 (map reId ids)
67 printDebug "Inserted : " inserted
68
69 runCmd' $ del [corpusId2, corpusId]
70
71
72 subFlow :: Username -> IO (UserId, RootId, CorpusId)
73 subFlow username = do
74 maybeUserId <- runCmd' (getUser username)
75
76 let userId = case maybeUserId of
77 Nothing -> panic "Error: User does not exist (yet)"
78 -- mk NodeUser gargantua_id "Node Gargantua"
79 Just user -> userLight_id user
80
81 rootId' <- map _node_id <$> runCmd' (getRoot userId)
82
83 rootId'' <- case rootId' of
84 [] -> runCmd' (mkRoot userId)
85 un -> case length un >= 2 of
86 True -> panic "Error: more than 1 userNode / user"
87 False -> pure rootId'
88 let rootId = maybe (panic "error rootId") identity (head rootId'')
89
90 corpusId' <- runCmd' $ mkCorpus (Just "Corpus WOS") Nothing rootId userId
91 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
92
93 printDebug "(username, userId, rootId, corpusId"
94 (username, userId, rootId, corpusId)
95 pure (userId, rootId, corpusId)
96
97 ----------------------------------------------------------------
98 type HashId = Text
99 type NodeId = Int
100 type ListId = Int
101 type ToInsert = Map HashId HyperdataDocument
102 type Inserted = Map HashId ReturnId
103
104 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
105 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d))
106 where
107 hash = maybe "Error" identity
108
109 toInserted :: [ReturnId] -> Map HashId ReturnId
110 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
111 $ filter (\r -> reInserted r == True) rs
112
113 data DocumentWithId =
114 DocumentWithId
115 { documentId :: NodeId
116 , documentData :: HyperdataDocument
117 }
118
119
120 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
121 mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs
122 where
123 lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs)
124
125 data DocumentIdWithNgrams =
126 DocumentIdWithNgrams
127 { documentWithId :: DocumentWithId
128 , document_ngrams :: Map (NgramsT Ngrams) Int
129 }
130
131 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
132 -> [DocumentWithId] -> [DocumentIdWithNgrams]
133 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
134
135 -- | TODO check optimization
136 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
137 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
138 where
139 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
140 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
141
142 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
143 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
144 indexNgrams ng2nId = do
145 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
146 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
147
148
149 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
150 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
151 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
152
153 | (ng, nId2int) <- DM.toList m
154 , (nId, n) <- DM.toList nId2int
155 ]
156
157 ------------------------------------------------------------------------
158 groupNgramsBy :: (Ngrams -> Ngrams -> Bool)
159 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
160 -> Map NgramsIndexed NgramsIndexed
161 groupNgramsBy = undefined
162
163 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
164 insertGroups lId ngrs =
165 insertNodeNgramNgram $ [ NodeNgramNgram lId ng1 ng2 (Just 1)
166 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
167 ]
168
169 ------------------------------------------------------------------------
170 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Map ListType NgramsIndexed
171 ngrams2list = undefined
172
173 -- | TODO: weight of the list could be a probability
174 insertLists :: ListId -> Map ListType NgramsIndexed -> Cmd Int
175 insertLists lId list2ngrams =
176 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listId l) (listId l)
177 | (l,ngr) <- map (second _ngramsId) $ DM.toList list2ngrams
178 ]
179
180
181 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
182 listFlow uId cId ng = do
183 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
184 -- TODO add stemming equivalence of 2 ngrams
185 let groupEd = groupNgramsBy (==) ng
186 _ <- insertGroups lId groupEd
187
188 -- compute Candidate / Map
189 let lists = ngrams2list ng
190 _ <- insertLists lId lists
191
192 pure lId
193 ------------------------------------------------------------------------
194