]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[SearchIn] Corpus API route : corpus/{id}/search
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
17 where
18
19 --import Control.Lens (view)
20 --import Gargantext.Core.Types
21 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
22 import Data.Map (Map, lookup)
23 import Data.Maybe (Maybe(..), catMaybes)
24 import Data.Text (Text, splitOn, intercalate)
25 import Data.Tuple.Extra (both, second)
26 import Data.List (concat)
27 import GHC.Show (Show)
28 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
29 import Gargantext.Core.Types.Individu (Username)
30 import Gargantext.Core.Types.Main
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Database.Bashql (runCmd') -- , del)
33 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
34 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
35 import Gargantext.Text.Terms (extractTerms)
36 import Gargantext.Database.Node.Document.Add (add)
37 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
38 import Gargantext.Database.Root (getRootCmd)
39 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams)
40 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId')
41 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
42 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
43 import Gargantext.Database.Schema.User (getUser, UserLight(..))
44 import Gargantext.Database.Types.Node (HyperdataDocument(..))
45 import Gargantext.Database.Types.Node (NodeType(..), NodeId)
46 import Gargantext.Database.Utils (Cmd(..))
47 import Gargantext.Text.Terms (TermType(..))
48 import Gargantext.Ext.IMT (toSchoolName)
49 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
50 import Gargantext.Prelude
51 import Gargantext.Text.Parsers (parseDocs, FileFormat)
52 import System.FilePath (FilePath)
53 import qualified Data.Map as DM
54
55 flowCorpus :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
56 flowCorpus ff fp cName = do
57 hyperdataDocuments' <- map addUniqIdsDoc <$> parseDocs ff fp
58 params <- flowInsert NodeCorpus hyperdataDocuments' cName
59 flowCorpus' NodeCorpus hyperdataDocuments' params
60
61
62 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
63 -> IO ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
64 flowInsert _nt hyperdataDocuments cName = do
65 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
66
67 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
68 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
69
70 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
71 _ <- runCmd' $ add userCorpusId (map reId ids)
72
73 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
74
75
76 flowAnnuaire :: FilePath -> IO ()
77 flowAnnuaire filePath = do
78 contacts <- deserialiseImtUsersFromFile filePath
79 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
80 printDebug "length annuaire" ps
81
82
83 flowInsertAnnuaire :: CorpusName -> [ToDbData]
84 -> IO ([ReturnId], UserId, CorpusId, UserId, CorpusId)
85 flowInsertAnnuaire name children = do
86
87 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
88 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeContact children
89
90 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
91 _ <- runCmd' $ add userCorpusId (map reId ids)
92
93 printDebug "AnnuaireID" userCorpusId
94
95 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
96
97
98 flowCorpus' :: NodeType -> [HyperdataDocument]
99 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
100 -> IO CorpusId
101 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
102 --------------------------------------------------
103 -- List Ngrams Flow
104 userListId <- runCmd' $ flowListUser userId userCorpusId
105 printDebug "Working on User ListId : " userListId
106
107 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
108 -- printDebug "documentsWithId" documentsWithId
109 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
110 -- printDebug "docsWithNgrams" docsWithNgrams
111 let maps = mapNodeIdNgrams docsWithNgrams
112
113 -- printDebug "maps" (maps)
114 indexedNgrams <- runCmd' $ indexNgrams maps
115 -- printDebug "inserted ngrams" indexedNgrams
116 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
117
118 listId2 <- runCmd' $ flowList masterUserId masterCorpusId indexedNgrams
119 printDebug "Working on ListId : " listId2
120 --}
121 --------------------------------------------------
122 _ <- runCmd' $ mkDashboard userCorpusId userId
123 _ <- runCmd' $ mkGraph userCorpusId userId
124
125 -- Annuaire Flow
126 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
127
128 pure userCorpusId
129 -- runCmd' $ del [corpusId2, corpusId]
130
131 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
132 flowCorpus' _ _ _ = undefined
133
134
135 type CorpusName = Text
136
137 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
138 subFlowCorpus username cName = do
139 maybeUserId <- runCmd' (getUser username)
140
141 let userId = case maybeUserId of
142 Nothing -> panic "Error: User does not exist (yet)"
143 -- mk NodeUser gargantua_id "Node Gargantua"
144 Just user -> userLight_id user
145
146 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
147
148 rootId'' <- case rootId' of
149 [] -> runCmd' (mkRoot username userId)
150 n -> case length n >= 2 of
151 True -> panic "Error: more than 1 userNode / user"
152 False -> pure rootId'
153 let rootId = maybe (panic "error rootId") identity (head rootId'')
154
155 corpusId'' <- if username == userMaster
156 then do
157 ns <- runCmd' $ getCorporaWithParentId' rootId
158 pure $ map _node_id ns
159 else
160 pure []
161
162 corpusId' <- if corpusId'' /= []
163 then pure corpusId''
164 else runCmd' $ mkCorpus (Just cName) Nothing rootId userId
165
166 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
167
168 printDebug "(username, userId, rootId, corpusId)"
169 (username, userId, rootId, corpusId)
170 pure (userId, rootId, corpusId)
171
172
173 subFlowAnnuaire :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
174 subFlowAnnuaire username _cName = do
175 maybeUserId <- runCmd' (getUser username)
176
177 let userId = case maybeUserId of
178 Nothing -> panic "Error: User does not exist (yet)"
179 -- mk NodeUser gargantua_id "Node Gargantua"
180 Just user -> userLight_id user
181
182 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
183
184 rootId'' <- case rootId' of
185 [] -> runCmd' (mkRoot username userId)
186 n -> case length n >= 2 of
187 True -> panic "Error: more than 1 userNode / user"
188 False -> pure rootId'
189 let rootId = maybe (panic "error rootId") identity (head rootId'')
190
191 corpusId' <- runCmd' $ mkAnnuaire rootId userId
192
193 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
194
195 printDebug "(username, userId, rootId, corpusId)"
196 (username, userId, rootId, corpusId)
197 pure (userId, rootId, corpusId)
198
199
200
201 ------------------------------------------------------------------------
202 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
203 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
204 where
205 err = "Database.Flow.toInsert"
206
207 toInserted :: [ReturnId] -> Map HashId ReturnId
208 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
209 . filter (\r -> reInserted r == True)
210
211 data DocumentWithId =
212 DocumentWithId { documentId :: NodeId
213 , documentData :: HyperdataDocument
214 } deriving (Show)
215
216 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
217 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
218 where
219 toDocumentWithId (hash,hpd) =
220 DocumentWithId <$> fmap reId (lookup hash rs)
221 <*> Just hpd
222
223 ------------------------------------------------------------------------
224
225 data DocumentIdWithNgrams =
226 DocumentIdWithNgrams
227 { documentWithId :: DocumentWithId
228 , document_ngrams :: Map (NgramsT Ngrams) Int
229 } deriving (Show)
230
231 -- TODO add Terms (Title + Abstract)
232 -- add f :: Text -> Text
233 -- newtype Ngrams = Ngrams Text
234 -- TODO group terms
235 extractNgramsT :: HyperdataDocument -> IO (Map (NgramsT Ngrams) Int)
236 extractNgramsT doc = do
237
238 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
239 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
240 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
241 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
242 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> extractTerms (Multi EN) leText
243
244 pure $ DM.fromList $ [(NgramsT Sources source, 1)]
245 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
246 <> [(NgramsT Authors a' , 1)| a' <- authors ]
247 <> [(NgramsT NgramsTerms t' , 1)| t' <- terms' ]
248
249
250
251
252 documentIdWithNgrams :: (HyperdataDocument -> IO (Map (NgramsT Ngrams) Int))
253 -> [DocumentWithId] -> IO [DocumentIdWithNgrams]
254 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
255 where
256 toDocumentIdWithNgrams d = do
257 e <- f $ documentData d
258 pure $ DocumentIdWithNgrams d e
259
260 -- | TODO check optimization
261 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
262 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
263 where
264 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
265 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
266
267 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
268 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
269 indexNgrams ng2nId = do
270 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
271 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
272
273
274 ------------------------------------------------------------------------
275 ------------------------------------------------------------------------
276 flowList :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
277 flowList uId cId ngs = do
278 -- printDebug "ngs:" ngs
279 lId <- getOrMkList cId uId
280 --printDebug "ngs" (DM.keys ngs)
281 -- TODO add stemming equivalence of 2 ngrams
282 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
283 _ <- insertGroups lId groupEd
284
285 -- compute Candidate / Map
286 let lists = ngrams2list ngs
287 -- printDebug "lists:" lists
288
289 is <- insertLists lId lists
290 printDebug "listNgrams inserted :" is
291
292 pure lId
293
294 flowListUser :: UserId -> CorpusId -> Cmd Int
295 flowListUser uId cId = getOrMkList cId uId
296
297 ------------------------------------------------------------------------
298
299 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
300 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
301 -> Map NgramsIndexed NgramsIndexed
302 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
303
304
305
306 -- TODO check: do not insert duplicates
307 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
308 insertGroups lId ngrs =
309 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
310 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
311 , ng1 /= ng2
312 ]
313
314 ------------------------------------------------------------------------
315 -- TODO: verify NgramsT lost here
316 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
317 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
318
319 -- | TODO: weight of the list could be a probability
320 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
321 insertLists lId lngs =
322 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
323 | (l,ngr) <- map (second _ngramsId) lngs
324 ]
325
326 ------------------------------------------------------------------------
327 ------------------------------------------------------------------------
328