]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Flow.hs
[Clean] before factoring
[gargantext.git] / src / Gargantext / Database / Action / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -- TODO-ACCESS:
11 -- check userId CanFillUserCorpus userCorpusId
12 -- check masterUserId CanFillMasterCorpus masterCorpusId
13
14 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
15 -- TODO-EVENTS: InsertedNodes
16 -}
17
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
19
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29
30 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
31 ( FlowCmdM
32 , flowCorpusFile
33 , flowCorpus
34 , flowCorpusSearchInDatabase
35 , getOrMkRoot
36 , getOrMk_RootWithCorpus
37 , flowAnnuaire
38 )
39 where
40
41 import Control.Lens ((^.), view, _Just)
42 import Data.Either
43 import Data.List (concat)
44 import Data.Map (Map, lookup)
45 import Data.Maybe (Maybe(..), catMaybes)
46 import Data.Monoid
47 import Data.Text (Text, splitOn, intercalate)
48 import Data.Traversable (traverse)
49 import Data.Tuple.Extra (first, second)
50 import Debug.Trace (trace)
51 import Gargantext.Core (Lang(..))
52 import Gargantext.Core.Flow.Types
53 import Gargantext.Core.Types (Terms(..))
54 import Gargantext.Core.Types.Individu (User(..))
55 import Gargantext.Core.Types.Main
56 import Gargantext.Database.Action.Flow.List
57 import Gargantext.Database.Action.Flow.Types
58 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
59 import Gargantext.Database.Action.Query.Node
60 import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..), ContactWho(..))
61 import Gargantext.Database.Action.Query.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
62 import Gargantext.Database.Action.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
63 import Gargantext.Database.Action.Search (searchInDatabase)
64 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
65 import Gargantext.Database.Admin.Types.Errors (HasNodeError(..))
66 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
67 import Gargantext.Database.Admin.Utils (Cmd)
68 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
69 import Gargantext.Database.Schema.NodeNgrams (listInsertDb , getCgramsId)
70 import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
71 import Gargantext.Ext.IMT (toSchoolName)
72 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
73 import Gargantext.Prelude
74 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
75 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
76 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
77 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
78 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
79 import Prelude (String)
80 import System.FilePath (FilePath)
81 import qualified Data.List as List
82 import qualified Data.Map as Map
83 import qualified Data.Text as Text
84 import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
85 import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
86
87 ------------------------------------------------------------------------
88
89 data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
90 -- | APIs
91 -- TODO instances
92 getDataApi :: Lang
93 -> Maybe Limit
94 -> ApiQuery
95 -> IO [HyperdataDocument]
96 getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
97 getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
98
99
100 -- UNUSED
101 _flowCorpusApi :: ( FlowCmdM env err m)
102 => User -> Either CorpusName [CorpusId]
103 -> TermType Lang
104 -> Maybe Limit
105 -> ApiQuery
106 -> m CorpusId
107 _flowCorpusApi u n tt l q = do
108 docs <- liftBase $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
109 flowCorpus u n tt docs
110
111 ------------------------------------------------------------------------
112 flowAnnuaire :: FlowCmdM env err m
113 => User
114 -> Either CorpusName [CorpusId]
115 -> (TermType Lang)
116 -> FilePath
117 -> m AnnuaireId
118 flowAnnuaire u n l filePath = do
119 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
120 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
121 ------------------------------------------------------------------------
122
123 flowCorpusFile :: FlowCmdM env err m
124 => User -> Either CorpusName [CorpusId]
125 -> Limit -- Limit the number of docs (for dev purpose)
126 -> TermType Lang -> FileFormat -> FilePath
127 -> m CorpusId
128 flowCorpusFile u n l la ff fp = do
129 docs <- liftBase ( splitEvery 500
130 <$> take l
131 <$> parseFile ff fp
132 )
133 flowCorpus u n la (map (map toHyperdataDocument) docs)
134
135 -- TODO query with complex query
136 flowCorpusSearchInDatabase :: FlowCmdM env err m
137 => User
138 -> Lang
139 -> Text
140 -> m CorpusId
141 flowCorpusSearchInDatabase u la q = do
142 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
143 (UserName userMaster)
144 (Left "")
145 (Nothing :: Maybe HyperdataCorpus)
146 ids <- map fst <$> searchInDatabase cId (stemIt q)
147 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
148
149
150 -- UNUSED
151 _flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
152 => User
153 -> Lang
154 -> Text
155 -> m CorpusId
156 _flowCorpusSearchInDatabaseApi u la q = do
157 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
158 (UserName userMaster)
159 (Left "")
160 (Nothing :: Maybe HyperdataCorpus)
161 ids <- map fst <$> searchInDatabase cId (stemIt q)
162 flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
163
164 ------------------------------------------------------------------------
165 -- | TODO improve the needed type to create/update a corpus
166 {- UNUSED
167 data UserInfo = Username Text
168 | UserId NodeId
169 data CorpusInfo = CorpusName Lang Text
170 | CorpusId Lang NodeId
171 -}
172
173 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
174 => Maybe c
175 -> User
176 -> Either CorpusName [CorpusId]
177 -> TermType Lang
178 -> [[a]]
179 -> m CorpusId
180 flow c u cn la docs = do
181 ids <- traverse (insertMasterDocs c la ) docs
182 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
183
184 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
185 => User
186 -> Either CorpusName [CorpusId]
187 -> TermType Lang
188 -> [[a]]
189 -> m CorpusId
190 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
191
192 ------------------------------------------------------------------------
193 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
194 => Lang
195 -> User
196 -> Either CorpusName [CorpusId]
197 -> Maybe c
198 -> [NodeId]
199 -> m CorpusId
200 flowCorpusUser l user corpusName ctype ids = do
201 -- User Flow
202 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
203 listId <- getOrMkList userCorpusId userId
204 _cooc <- mkNode NodeListCooc listId userId
205 -- TODO: check if present already, ignore
206 _ <- Doc.add userCorpusId ids
207
208 _tId <- mkNode NodeTexts userCorpusId userId
209 -- printDebug "Node Text Id" tId
210
211 -- User List Flow
212 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
213 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
214 _userListId <- flowList_DbRepo listId ngs
215 _mastListId <- getOrMkList masterCorpusId masterUserId
216 -- _ <- insertOccsUpdates userCorpusId mastListId
217 -- printDebug "userListId" userListId
218 -- User Graph Flow
219 _ <- mkDashboard userCorpusId userId
220 _ <- mkGraph userCorpusId userId
221 --_ <- mkPhylo userCorpusId userId
222
223 -- Annuaire Flow
224 -- _ <- mkAnnuaire rootUserId userId
225 pure userCorpusId
226
227
228 insertMasterDocs :: ( FlowCmdM env err m
229 , FlowCorpus a
230 , MkCorpus c
231 )
232 => Maybe c
233 -> TermType Lang
234 -> [a]
235 -> m [DocId]
236 insertMasterDocs c lang hs = do
237 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
238
239 -- TODO Type NodeDocumentUnicised
240 let docs = map addUniqId hs
241 ids <- insertDb masterUserId masterCorpusId docs
242 let
243 ids' = map reId ids
244 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
245 -- TODO
246 -- create a corpus with database name (CSV or PubMed)
247 -- add documents to the corpus (create node_node link)
248 -- this will enable global database monitoring
249
250 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
251 maps <- mapNodeIdNgrams
252 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
253
254 terms2id <- insertNgrams $ Map.keys maps
255 -- to be removed
256 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
257
258 -- new
259 lId <- getOrMkList masterCorpusId masterUserId
260 mapCgramsId <- listInsertDb lId toNodeNgramsW'
261 $ map (first _ngramsTerms . second Map.keys)
262 $ Map.toList maps
263 -- insertDocNgrams
264 _return <- insertNodeNodeNgrams2
265 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
266 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
267 <*> Just (fromIntegral w :: Double)
268 | (terms, mapNgramsTypes) <- Map.toList maps
269 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
270 , (nId, w) <- Map.toList mapNodeIdWeight
271 ]
272
273 _ <- Doc.add masterCorpusId ids'
274 _cooc <- mkNode NodeListCooc lId masterUserId
275 -- to be removed
276 _ <- insertDocNgrams lId indexedNgrams
277
278 pure ids'
279
280
281 withLang :: HasText a => TermType Lang
282 -> [DocumentWithId a]
283 -> TermType Lang
284 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
285 where
286 m' = case m of
287 Nothing -> trace ("buildTries here" :: String)
288 $ Just
289 $ buildTries n ( fmap toToken $ uniText
290 $ Text.intercalate " . "
291 $ List.concat
292 $ map hasText ns
293 )
294 just_m -> just_m
295 withLang l _ = l
296
297
298 ------------------------------------------------------------------------
299 viewUniqId' :: UniqId a
300 => a
301 -> (HashId, a)
302 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
303 where
304 err = panic "[ERROR] Database.Flow.toInsert"
305
306
307 toInserted :: [ReturnId]
308 -> Map HashId ReturnId
309 toInserted =
310 Map.fromList . map (\r -> (reUniqId r, r) )
311 . filter (\r -> reInserted r == True)
312
313 mergeData :: Map HashId ReturnId
314 -> Map HashId a
315 -> [DocumentWithId a]
316 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
317 where
318 toDocumentWithId (sha,hpd) =
319 DocumentWithId <$> fmap reId (lookup sha rs)
320 <*> Just hpd
321
322 ------------------------------------------------------------------------
323
324 instance HasText HyperdataContact
325 where
326 hasText = undefined
327
328 instance ExtractNgramsT HyperdataContact
329 where
330 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
331 where
332 extract :: TermType Lang -> HyperdataContact
333 -> Cmd err (Map Ngrams (Map NgramsType Int))
334 extract _l hc' = do
335 let authors = map text2ngrams
336 $ maybe ["Nothing"] (\a -> [a])
337 $ view (hc_who . _Just . cw_lastName) hc'
338
339 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
340
341 instance HasText HyperdataDocument
342 where
343 hasText h = catMaybes [ _hyperdataDocument_title h
344 , _hyperdataDocument_abstract h
345 ]
346
347 instance ExtractNgramsT HyperdataDocument
348 where
349 extractNgramsT :: TermType Lang
350 -> HyperdataDocument
351 -> Cmd err (Map Ngrams (Map NgramsType Int))
352 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
353 where
354 extractNgramsT' :: TermType Lang
355 -> HyperdataDocument
356 -> Cmd err (Map Ngrams (Map NgramsType Int))
357 extractNgramsT' lang' doc = do
358 let source = text2ngrams
359 $ maybe "Nothing" identity
360 $ _hyperdataDocument_source doc
361
362 institutes = map text2ngrams
363 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
364 $ _hyperdataDocument_institutes doc
365
366 authors = map text2ngrams
367 $ maybe ["Nothing"] (splitOn ", ")
368 $ _hyperdataDocument_authors doc
369
370 terms' <- map text2ngrams
371 <$> map (intercalate " " . _terms_label)
372 <$> concat
373 <$> liftBase (extractTerms lang' $ hasText doc)
374
375 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
376 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
377 <> [(a', Map.singleton Authors 1) | a' <- authors ]
378 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
379
380 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
381 -> Map Ngrams (Map NgramsType Int)
382 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
383 where
384 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
385 True -> (ng,y)
386 False -> (Ngrams (Text.take s' t) n , y)
387
388
389 documentIdWithNgrams :: HasNodeError err
390 => (a
391 -> Cmd err (Map Ngrams (Map NgramsType Int)))
392 -> [DocumentWithId a]
393 -> Cmd err [DocumentIdWithNgrams a]
394 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
395 where
396 toDocumentIdWithNgrams d = do
397 e <- f $ documentData d
398 pure $ DocumentIdWithNgrams d e
399