]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[DB][Flow] question about the map of ngramsT.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 Map (NgramsId, NodeId) -> insert
12 data NgramsType = Sources | Authors | Terms
13 nodes_ngrams : column type, column list
14
15 documents
16 sources
17 authors
18
19 -}
20
21 {-# LANGUAGE DeriveGeneric #-}
22 {-# LANGUAGE NoImplicitPrelude #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Database.Flow (flowDatabase)
26 where
27
28 import GHC.Show (Show)
29 import System.FilePath (FilePath)
30 import Data.Maybe (Maybe(..), catMaybes)
31 import Data.Text (Text, splitOn)
32 import Data.Map (Map)
33 import Data.Tuple.Extra (both, second)
34 import qualified Data.Map as DM
35
36 import Gargantext.Core.Types (NodePoly(..), ListType(..), listId)
37 import Gargantext.Database.Bashql (runCmd')--, del)
38 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
39 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)
40 import Gargantext.Database.Node.Document.Add (add)
41 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
42 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
44 import Gargantext.Database.Types.Node (HyperdataDocument(..))
45 import Gargantext.Database.User (getUser, UserLight(..), Username)
46 import Gargantext.Prelude
47 import Gargantext.Text.Parsers (parseDocs, FileFormat)
48 import Gargantext.Ext.IMT (toSchoolName)
49
50 type UserId = Int
51 type RootId = Int
52 type CorpusId = Int
53
54 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO [Int]
55 flowDatabase ff fp cName = do
56
57 -- Corus Flow
58 (masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus"
59
60 -- Documents Flow
61 hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
62
63 --printDebug "hyperdataDocuments" hyperdataDocuments
64
65 ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
66 --printDebug "Docs IDs : " (ids)
67 idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
68 --printDebug "Repeated Docs IDs : " (length ids)
69
70 -- Ngrams Flow
71 -- todo: flow for new documents only
72 let tids = toInserted ids
73 --printDebug "toInserted ids" (length tids, tids)
74
75 let tihs = toInsert hyperdataDocuments
76 --printDebug "toInsert hyperdataDocuments" (length tihs, tihs)
77
78 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
79 -- printDebug "documentsWithId" documentsWithId
80
81 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
82 printDebug "docsWithNgrams" docsWithNgrams
83 {-
84
85 let maps = mapNodeIdNgrams docsWithNgrams
86 printDebug "maps" (maps)
87
88 indexedNgrams <- runCmd' $ indexNgrams maps
89 printDebug "inserted ngrams" indexedNgrams
90 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
91
92 -- List Flow
93 listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
94 printDebug "list id : " listId2
95
96 printDebug "Docs IDs : " (length idsRepeat)
97
98 -}
99 (_, _, corpusId2) <- subFlow "user1" cName
100 {-
101 inserted <- runCmd' $ add corpusId2 (map reId ids)
102 printDebug "Inserted : " (length inserted)
103 -}
104 pure [corpusId2, corpusId]
105
106 --runCmd' $ del [corpusId2, corpusId]
107
108 type CorpusName = Text
109
110 subFlow :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
111 subFlow username cName = do
112 maybeUserId <- runCmd' (getUser username)
113
114 let userId = case maybeUserId of
115 Nothing -> panic "Error: User does not exist (yet)"
116 -- mk NodeUser gargantua_id "Node Gargantua"
117 Just user -> userLight_id user
118
119 rootId' <- map _node_id <$> runCmd' (getRoot userId)
120
121 rootId'' <- case rootId' of
122 [] -> runCmd' (mkRoot username userId)
123 n -> case length n >= 2 of
124 True -> panic "Error: more than 1 userNode / user"
125 False -> pure rootId'
126 let rootId = maybe (panic "error rootId") identity (head rootId'')
127
128 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
129 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
130
131 printDebug "(username, userId, rootId, corpusId)"
132 (username, userId, rootId, corpusId)
133 pure (userId, rootId, corpusId)
134
135 ------------------------------------------------------------------------
136
137 type HashId = Text
138 type NodeId = Int
139 type ListId = Int
140
141 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
142 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
143 where
144 hash = maybe "Error" identity
145
146 toInserted :: [ReturnId] -> Map HashId ReturnId
147 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
148 $ filter (\r -> reInserted r == True) rs
149
150 data DocumentWithId =
151 DocumentWithId { documentId :: NodeId
152 , documentData :: HyperdataDocument
153 } deriving (Show)
154
155 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
156 mergeData rs hs = map (\(hash,hpd) -> DocumentWithId (lookup' hash rs) hpd) $ DM.toList hs
157 where
158 lookup' h xs = maybe (panic $ "Database.Flow.mergeData: Error with " <> h) reId (DM.lookup h rs)
159
160 ------------------------------------------------------------------------
161
162 data DocumentIdWithNgrams =
163 DocumentIdWithNgrams
164 { documentWithId :: DocumentWithId
165 , document_ngrams :: Map (NgramsT Ngrams) Int
166 } deriving (Show)
167
168 -- TODO add Terms (Title + Abstract)
169 -- add f :: Text -> Text
170 -- newtype Ngrams = Ngrams Text
171 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
172 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
173 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
174 <> [(NgramsT Authors a' , 1)| a' <- authors ]
175 where
176 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
177 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
178 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
179 -- TODO group terms
180
181 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
182 -> [DocumentWithId] -> [DocumentIdWithNgrams]
183 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
184
185 -- | TODO check optimization
186 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
187 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
188 where
189 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
190 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
191
192 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
193 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
194 indexNgrams ng2nId = do
195 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
196 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
197
198
199 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
200 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
201 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
202 | (ng, nId2int) <- DM.toList m
203 , (nId, n) <- DM.toList nId2int
204 ]
205
206
207 ------------------------------------------------------------------------
208 ------------------------------------------------------------------------
209 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
210 listFlow uId cId ng = do
211 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
212 -- TODO add stemming equivalence of 2 ngrams
213 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ng
214 _ <- insertGroups lId groupEd
215
216 -- compute Candidate / Map
217 let lists = ngrams2list ng
218 _ <- insertLists lId lists
219
220 pure lId
221
222 ------------------------------------------------------------------------
223
224 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
225 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
226 -> Map NgramsIndexed NgramsIndexed
227 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
228
229
230
231 -- TODO check: do not insert duplicates
232 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
233 insertGroups lId ngrs =
234 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
235 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
236 ]
237
238 ------------------------------------------------------------------------
239 -- TODO: verify NgramsT lost here
240 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Map ListType NgramsIndexed
241 ngrams2list = DM.fromList . zip (repeat Candidate) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
242
243 -- | TODO: weight of the list could be a probability
244 insertLists :: ListId -> Map ListType NgramsIndexed -> Cmd Int
245 insertLists lId list2ngrams =
246 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listId l) (listId l)
247 | (l,ngr) <- map (second _ngramsId) $ DM.toList list2ngrams
248 ]
249
250 ------------------------------------------------------------------------
251 ------------------------------------------------------------------------
252