{-|
Module      : Gargantext.Database.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Map (NgramsId, NodeId) -> insert
data NgramsType = Sources | Authors | Terms
nodes_ngrams : column type, column list

documents
sources
authors

-}

{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}

module Gargantext.Database.Flow (flowDatabase, ngrams2list)
    where

import GHC.Show (Show)
import System.FilePath (FilePath)
import Data.Maybe (Maybe(..), catMaybes)
import Data.Text (Text, splitOn)
import Data.Map (Map)
import Data.Tuple.Extra (both, second)
import qualified Data.Map as DM

import Gargantext.Core.Types (NodePoly(..), ListType(..), listId)
import Gargantext.Database.Bashql (runCmd', del)
import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList)
import Gargantext.Database.Node.Document.Add (add)
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
import Gargantext.Database.Types.Node (HyperdataDocument(..))
import Gargantext.Database.User (getUser, UserLight(..), Username)
import Gargantext.Prelude
import Gargantext.Text.Parsers (parseDocs, FileFormat)
import Gargantext.Ext.IMT (toSchoolName)

type UserId   = Int
type RootId   = Int
type CorpusId = Int

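-- | Top-level flow: parse a corpus file, insert its documents under the
-- master ("gargantua") corpus, extract and index their ngrams, build the
-- corresponding lists, and finally attach the documents to a new corpus for
-- the "user1" user; the id of that user corpus is returned.
--
-- A minimal usage sketch (the 'WOS' constructor and the file path are only
-- illustrative; any 'FileFormat' accepted by 'parseDocs' works):
--
-- > corpusId <- flowDatabase WOS "/tmp/corpus.txt" "My Corpus"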
-- flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
flowDatabase ff fp cName = do

  -- Corpus Flow
  (masterUserId, _, corpusId) <- subFlow "gargantua" "Big Corpus"

  -- Documents Flow
  hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
  --printDebug "hyperdataDocuments" hyperdataDocuments

  ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
  --printDebug "Docs IDs : " (ids)
  _idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
  --printDebug "Repeated Docs IDs : " (length ids)

  -- Ngrams Flow
  -- todo: flow for new documents only
  -- let tids = toInserted ids
  --printDebug "toInserted ids" (length tids, tids)

  -- let tihs = toInsert hyperdataDocuments
  --printDebug "toInsert hyperdataDocuments" (length tihs, tihs)

  let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
  -- printDebug "documentsWithId" documentsWithId

  let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
  printDebug "docsWithNgrams" docsWithNgrams

  let maps = mapNodeIdNgrams docsWithNgrams
  printDebug "maps" maps

  indexedNgrams <- runCmd' $ indexNgrams maps
  printDebug "inserted ngrams" indexedNgrams
  _ <- runCmd' $ insertToNodeNgrams indexedNgrams

  -- List Flow
  listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
  printDebug "list id : " listId2

  (userId, _, corpusId2) <- subFlow "user1" cName

  userListId <- runCmd' $ listFlowUser userId corpusId2
  printDebug "UserList : " userListId

  inserted <- runCmd' $ add corpusId2 (map reId ids)
  printDebug "Inserted : " (length inserted)

  pure corpusId2
  -- runCmd' $ del [corpusId2, corpusId]

type CorpusName = Text

subFlow :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
subFlow username cName = do
  maybeUserId <- runCmd' (getUser username)

  let userId = case maybeUserId of
        Nothing   -> panic "Error: User does not exist (yet)"
        -- mk NodeUser gargantua_id "Node Gargantua"
        Just user -> userLight_id user

  rootId' <- map _node_id <$> runCmd' (getRoot userId)

  rootId'' <- case rootId' of
        []  -> runCmd' (mkRoot username userId)
        [_] -> pure rootId'
        _   -> panic "Error: more than one userNode per user"
  let rootId = maybe (panic "error rootId") identity (head rootId'')

  corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
  let corpusId = maybe (panic "error corpusId") identity (head corpusId')

  printDebug "(username, userId, rootId, corpusId)"
             (username, userId, rootId, corpusId)
  pure (userId, rootId, corpusId)

------------------------------------------------------------------------

type HashId = Text
type NodeId = Int
type ListId = Int

toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
  where
    hash = maybe "Error" identity

toInserted :: [ReturnId] -> Map HashId ReturnId
toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r))
                            $ filter reInserted rs

data DocumentWithId =
     DocumentWithId { documentId   :: NodeId
                    , documentData :: HyperdataDocument
                    } deriving (Show)

mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
mergeData rs hs = map (\(hash, hpd) -> DocumentWithId (lookup' hash rs) hpd) $ DM.toList hs
  where
    lookup' h xs = maybe (panic $ message <> h) reId (DM.lookup h xs)
    message = "Database.Flow.mergeData: Error with "

------------------------------------------------------------------------

data DocumentIdWithNgrams =
     DocumentIdWithNgrams
     { documentWithId  :: DocumentWithId
     , document_ngrams :: Map (NgramsT Ngrams) Int
     } deriving (Show)

-- TODO add Terms (Title + Abstract)
-- add f :: Text -> Text
-- newtype Ngrams = Ngrams Text
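--
-- Sketch of the expected output (field values invented for illustration):
-- for a document with source "Nature", institutes "CNRS" and authors
-- "Alice, Bob", the extracted map is roughly
--
-- >  DM.fromList [ (NgramsT Sources    (text2ngrams "Nature")             , 1)
-- >              , (NgramsT Institutes (text2ngrams (toSchoolName "CNRS")), 1)
-- >              , (NgramsT Authors    (text2ngrams "Alice")              , 1)
-- >              , (NgramsT Authors    (text2ngrams "Bob")                , 1)
-- >              ]
--
-- A missing field falls back to the literal ngram "Nothing" (see below).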
extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
extractNgramsT doc = DM.fromList $  [(NgramsT Sources source, 1)]
                                 <> [(NgramsT Institutes i' , 1) | i' <- institutes ]
                                 <> [(NgramsT Authors    a' , 1) | a' <- authors    ]
  where
    source     = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
    institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . splitOn ", ") $ _hyperdataDocument_institutes doc
    authors    = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
    -- TODO group terms

documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
                     -> [DocumentWithId] -> [DocumentIdWithNgrams]
documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))

-- | TODO check optimization
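-- Inverts the per-document maps: @[document -> (ngram -> count)]@ becomes
-- @ngram -> (document -> count)@, summing counts if the same (ngram, document)
-- pair shows up more than once. Sketch with made-up node ids: two documents 1
-- and 2 both containing the source "Nature" once end up as
--
-- >  DM.fromList [ (NgramsT Sources (text2ngrams "Nature"), DM.fromList [(1,1), (2,1)]) ]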
mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
  where
    xs  = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
    n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))

indexNgrams :: Map (NgramsT Ngrams) (Map NodeId Int)
            -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
indexNgrams ng2nId = do
  terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
  pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId

insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT) ng)
                                                    (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
                                        | (ng, nId2int) <- DM.toList m
                                        , (nId, n)      <- DM.toList nId2int
                                        ]

------------------------------------------------------------------------
------------------------------------------------------------------------
listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
listFlow uId cId ngs = do
  printDebug "ngs:" ngs
  lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
  printDebug "ngs" (DM.keys ngs)
  -- TODO add stemming equivalence of 2 ngrams
  let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2)
                                 -> if t1 == t2 && n1 == n2 then Just (n1, n2) else Nothing) ngs
  _ <- insertGroups lId groupEd

  -- compute Candidate / Map
  let lists = ngrams2list ngs
  printDebug "lists:" lists

  is <- insertLists lId lists
  printDebug "listNgrams inserted :" is

  pure lId

listFlowUser :: UserId -> CorpusId -> Cmd [Int]
listFlowUser uId cId = mkList cId uId

------------------------------------------------------------------------

groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
              -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
              -> Map NgramsIndexed NgramsIndexed
groupNgramsBy isEqual ngs = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys ngs, n2 <- DM.keys ngs ]
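
-- Note: with the strict equality predicate used in 'listFlow' above, each
-- ngram is only ever paired with itself, so the grouping is the identity;
-- and since 'insertGroups' skips pairs whose two ids are equal, no group is
-- actually inserted until a real (e.g. stemming-based) grouping is provided.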

-- TODO check: do not insert duplicates
insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
insertGroups lId ngrs =
  insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
                            | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
                            , ng1 /= ng2
                            ]

------------------------------------------------------------------------
-- TODO: the NgramsT type tag is dropped here; verify it is not needed downstream
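--
-- Every ngram is put in the Candidate list for now. Sketch with a made-up
-- node id and some indexed ngram @ng@:
--
-- > ngrams2list (DM.fromList [(NgramsT Sources ng, DM.fromList [(1,1)])])
-- >   == [(Candidate, ng)]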
ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType, NgramsIndexed)]
ngrams2list = zip (repeat Candidate) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys

-- | TODO: weight of the list could be a probability
insertLists :: ListId -> [(ListType, NgramsIndexed)] -> Cmd Int
insertLists lId lngs =
  insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listId l) (listId l)
                   | (l, ngr) <- map (second _ngramsId) lngs
                   ]

------------------------------------------------------------------------
------------------------------------------------------------------------