]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[FIX][DB][FLOW] insert listngrams.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 Map (NgramsId, NodeId) -> insert
12 data NgramsType = Sources | Authors | Terms
13 nodes_ngrams : column type, column list
14
15 documents
16 sources
17 authors
18
19 -}
20
21 {-# LANGUAGE DeriveGeneric #-}
22 {-# LANGUAGE NoImplicitPrelude #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Database.Flow (flowDatabase, ngrams2list)
26 where
27
28 import GHC.Show (Show)
29 import System.FilePath (FilePath)
30 import Data.Maybe (Maybe(..), catMaybes)
31 import Data.Text (Text, splitOn)
32 import Data.Map (Map)
33 import Data.Tuple.Extra (both, second)
34 import qualified Data.Map as DM
35
36 import Gargantext.Core.Types (NodePoly(..), ListType(..), listId)
37 import Gargantext.Database.Bashql (runCmd', del)
38 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
39 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)
40 import Gargantext.Database.Node.Document.Add (add)
41 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
42 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
44 import Gargantext.Database.Types.Node (HyperdataDocument(..))
45 import Gargantext.Database.User (getUser, UserLight(..), Username)
46 import Gargantext.Prelude
47 import Gargantext.Text.Parsers (parseDocs, FileFormat)
48 import Gargantext.Ext.IMT (toSchoolName)
49
50 type UserId = Int
51 type RootId = Int
52 type CorpusId = Int
53
54 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO Int
55 flowDatabase ff fp cName = do
56
57 -- Corus Flow
58 (masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus"
59
60 -- Documents Flow
61 hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
62
63 --printDebug "hyperdataDocuments" hyperdataDocuments
64
65 ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
66 --printDebug "Docs IDs : " (ids)
67 idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
68 --printDebug "Repeated Docs IDs : " (length ids)
69
70 -- Ngrams Flow
71 -- todo: flow for new documents only
72 -- let tids = toInserted ids
73 --printDebug "toInserted ids" (length tids, tids)
74
75 -- let tihs = toInsert hyperdataDocuments
76 --printDebug "toInsert hyperdataDocuments" (length tihs, tihs)
77
78 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
79 -- printDebug "documentsWithId" documentsWithId
80
81 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
82 printDebug "docsWithNgrams" docsWithNgrams
83
84 let maps = mapNodeIdNgrams docsWithNgrams
85 printDebug "maps" (maps)
86
87 indexedNgrams <- runCmd' $ indexNgrams maps
88 printDebug "inserted ngrams" indexedNgrams
89 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
90
91 -- List Flow
92 listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
93 printDebug "list id : " listId2
94
95 (_, _, corpusId2) <- subFlow "user1" cName
96 inserted <- runCmd' $ add corpusId2 (map reId ids)
97 printDebug "Inserted : " (length inserted)
98
99 pure corpusId2
100 -- runCmd' $ del [corpusId2, corpusId]
101
102 type CorpusName = Text
103
104 subFlow :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
105 subFlow username cName = do
106 maybeUserId <- runCmd' (getUser username)
107
108 let userId = case maybeUserId of
109 Nothing -> panic "Error: User does not exist (yet)"
110 -- mk NodeUser gargantua_id "Node Gargantua"
111 Just user -> userLight_id user
112
113 rootId' <- map _node_id <$> runCmd' (getRoot userId)
114
115 rootId'' <- case rootId' of
116 [] -> runCmd' (mkRoot username userId)
117 n -> case length n >= 2 of
118 True -> panic "Error: more than 1 userNode / user"
119 False -> pure rootId'
120 let rootId = maybe (panic "error rootId") identity (head rootId'')
121
122 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
123 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
124
125 printDebug "(username, userId, rootId, corpusId)"
126 (username, userId, rootId, corpusId)
127 pure (userId, rootId, corpusId)
128
129 ------------------------------------------------------------------------
130
131 type HashId = Text
132 type NodeId = Int
133 type ListId = Int
134
135 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
136 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
137 where
138 hash = maybe "Error" identity
139
140 toInserted :: [ReturnId] -> Map HashId ReturnId
141 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
142 $ filter (\r -> reInserted r == True) rs
143
144 data DocumentWithId =
145 DocumentWithId { documentId :: NodeId
146 , documentData :: HyperdataDocument
147 } deriving (Show)
148
149 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
150 mergeData rs hs = map (\(hash,hpd) -> DocumentWithId (lookup' hash rs) hpd) $ DM.toList hs
151 where
152 lookup' h xs = maybe (panic $ message <> h) reId (DM.lookup h xs)
153 message = "Database.Flow.mergeData: Error with "
154
155 ------------------------------------------------------------------------
156
157 data DocumentIdWithNgrams =
158 DocumentIdWithNgrams
159 { documentWithId :: DocumentWithId
160 , document_ngrams :: Map (NgramsT Ngrams) Int
161 } deriving (Show)
162
163 -- TODO add Terms (Title + Abstract)
164 -- add f :: Text -> Text
165 -- newtype Ngrams = Ngrams Text
166 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
167 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
168 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
169 <> [(NgramsT Authors a' , 1)| a' <- authors ]
170 where
171 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
172 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
173 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
174 -- TODO group terms
175
176 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
177 -> [DocumentWithId] -> [DocumentIdWithNgrams]
178 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
179
180 -- | TODO check optimization
181 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
182 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
183 where
184 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
185 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
186
187 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
188 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
189 indexNgrams ng2nId = do
190 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
191 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
192
193
194 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
195 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
196 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
197 | (ng, nId2int) <- DM.toList m
198 , (nId, n) <- DM.toList nId2int
199 ]
200
201
202 ------------------------------------------------------------------------
203 ------------------------------------------------------------------------
204 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
205 listFlow uId cId ngs = do
206 printDebug "ngs:" ngs
207 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
208 printDebug "ngs" (DM.keys ngs)
209 -- TODO add stemming equivalence of 2 ngrams
210 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
211 _ <- insertGroups lId groupEd
212
213 -- compute Candidate / Map
214 let lists = ngrams2list ngs
215 printDebug "lists:" lists
216
217 is <- insertLists lId lists
218 printDebug "listNgrams inserted :" is
219
220 pure lId
221
222 ------------------------------------------------------------------------
223
224 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
225 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
226 -> Map NgramsIndexed NgramsIndexed
227 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
228
229
230
231 -- TODO check: do not insert duplicates
232 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
233 insertGroups lId ngrs =
234 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
235 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
236 , ng1 /= ng2
237 ]
238
239 ------------------------------------------------------------------------
240 -- TODO: verify NgramsT lost here
241 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
242 ngrams2list = zip (repeat Candidate) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
243
244 -- | TODO: weight of the list could be a probability
245 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
246 insertLists lId lngs =
247 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listId l) (listId l)
248 | (l,ngr) <- map (second _ngramsId) lngs
249 ]
250
251 ------------------------------------------------------------------------
252 ------------------------------------------------------------------------
253