-- |
-- Module      : Gargantext.Database.Flow
-- Description : Database Flow
-- Copyright   : (c) CNRS, 2017-Present
-- License     : AGPL + CECILL v3
-- Maintainer  : team@gargantext.org
-- Stability   : experimental
-- TODO-ACCESS:
--   check userId       CanFillUserCorpus   userCorpusId
--   check masterUserId CanFillMasterCorpus masterCorpusId
-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
18 {-# OPTIONS_GHC -fno-warn-orphans #-}
20 {-# LANGUAGE ConstraintKinds #-}
21 {-# LANGUAGE RankNTypes #-}
22 {-# LANGUAGE ConstrainedClassMethods #-}
23 {-# LANGUAGE ConstraintKinds #-}
24 {-# LANGUAGE DeriveGeneric #-}
25 {-# LANGUAGE FlexibleContexts #-}
26 {-# LANGUAGE InstanceSigs #-}
27 {-# LANGUAGE NoImplicitPrelude #-}
28 {-# LANGUAGE OverloadedStrings #-}
29 {-# LANGUAGE TemplateHaskell #-}
31 module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
41 , getOrMk_RootWithCorpus
50 import Control.Lens ((^.), view, _Just, makeLenses)
51 import Data.Aeson.TH (deriveJSON)
53 import Data.List (concat)
54 import Data.Map (Map, lookup)
55 import Data.Maybe (Maybe(..), catMaybes)
58 import Data.Text (splitOn, intercalate)
59 import Data.Traversable (traverse)
60 import Data.Tuple.Extra (first, second)
61 import Debug.Trace (trace)
62 import Gargantext.Core (Lang(..))
63 import Gargantext.Core.Flow.Types
64 import Gargantext.Core.Types (Terms(..))
65 import Gargantext.Core.Types.Individu (User(..))
66 import Gargantext.Core.Types.Main
67 import Gargantext.Database.Action.Flow.List
68 import Gargantext.Database.Action.Flow.Types
69 import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
70 import Gargantext.Database.Action.Query.Node
71 import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..), ContactWho(..))
72 import Gargantext.Database.Action.Query.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
73 import Gargantext.Database.Action.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
74 import Gargantext.Database.Action.Search (searchInDatabase)
75 import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
76 import Gargantext.Database.Admin.Types.Errors (HasNodeError(..))
77 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
78 import Gargantext.Database.Admin.Utils (Cmd)
79 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
80 import Gargantext.Database.Schema.NodeNgrams (listInsertDb , getCgramsId)
81 import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
82 import Gargantext.Ext.IMT (toSchoolName)
83 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
84 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
85 import Gargantext.Prelude
86 import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
87 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
88 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
89 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
90 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
91 import GHC.Generics (Generic)
92 import Prelude (String)
93 import System.FilePath (FilePath)
94 import qualified Data.List as List
95 import qualified Data.Map as Map
96 import qualified Data.Text as Text
97 import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
98 import qualified Gargantext.Text.Corpus.API as API
100 ------------------------------------------------------------------------
101 -- TODO use internal with API name (could be old data)
-- | Provenance of a batch of documents: 'InternalOrigin' means the data is
-- already in the local (master) database, 'ExternalOrigin' means it is
-- fetched from a remote API.  Both carry the API identifier.
-- NOTE(review): this span is garbled — each line has a stale line-number
-- prefix and at least one original line between the constructors and the
-- deriving clause is missing; recover the full text from VCS before editing.
102 data DataOrigin = InternalOrigin { _do_api :: API.ExternalAPIs }
103 | ExternalOrigin { _do_api :: API.ExternalAPIs }
105 deriving (Generic, Eq)
-- Template Haskell: lenses plus JSON (de)serialisation for 'DataOrigin',
-- stripping the "_do_" field prefix from the JSON keys.
107 makeLenses ''DataOrigin
108 deriveJSON (unPrefix "_do_") ''DataOrigin
-- Swagger schema mirrors the JSON encoding (same "_do_" prefix stripping).
109 instance ToSchema DataOrigin where
110 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_do_")
-- | Every data origin the system knows about: each external API appears
-- once tagged as an internal origin, then once as an external one.
allDataOrigins :: [DataOrigin]
allDataOrigins = [ mk api | mk  <- [InternalOrigin, ExternalOrigin]
                          , api <- API.externalAPIs
                          ]
-- | Result of 'getDataText': either references to documents already stored
-- ('DataOld', node ids) or freshly fetched documents in batches ('DataNew').
-- Fields are strict (!).  NOTE(review): a trailing line (e.g. a deriving
-- clause) may be missing from this garbled fragment — check VCS.
118 data DataText = DataOld ![NodeId]
119 | DataNew ![[HyperdataDocument]]
122 -- TODO use the split parameter in config file
-- | Fetch the text data to flow, according to its 'DataOrigin'.
-- NOTE(review): fragment is garbled (stale line-number prefixes, missing
-- lines in the signature and body) — recover the full definition from VCS.
123 getDataText :: FlowCmdM env err m
-- External origin: query the remote API with the language, query and limit,
-- and wrap the fetched documents as DataNew.
129 getDataText (ExternalOrigin api) la q li = liftBase $ DataNew
131 <$> API.get api (_tt_lang la) q li
-- Internal origin: look up the master corpus locally and search it.
132 getDataText (InternalOrigin _) _la q _li = do
133 (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
134 (UserName userMaster)
136 (Nothing :: Maybe HyperdataCorpus)
-- Stem the query before searching; keep only the node ids (fst).
-- Presumably the (missing) final line wraps `ids` as DataOld — confirm.
137 ids <- map fst <$> searchInDatabase cId (stemIt q)
140 -------------------------------------------------------------------------------
-- | Flow previously fetched 'DataText' into the given corpus.
-- DataOld: documents already exist, only (re)build the user corpus around
-- the node ids.  DataNew: run the full corpus flow on the raw documents.
-- NOTE(review): signature lines and the `where` keyword for `corpusType`
-- are missing from this garbled fragment.
141 flowDataText :: FlowCmdM env err m
147 flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
-- No specific hyperdata: the corpus type is left unspecified.
149 corpusType = (Nothing :: Maybe HyperdataCorpus)
150 flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
152 ------------------------------------------------------------------------
-- | Build an annuaire (directory of contacts) from an IMT users file:
-- deserialise the contacts, batch them by 500, and run the generic 'flow'
-- with annuaire hyperdata.  NOTE(review): garbled fragment, signature
-- lines missing — check VCS.
154 flowAnnuaire :: FlowCmdM env err m
156 -> Either CorpusName [CorpusId]
160 flowAnnuaire u n l filePath = do
-- Batching by 500 bounds the size of each database insert.
161 docs <- liftBase $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
162 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
164 ------------------------------------------------------------------------
-- | Build a corpus from a local file: parse it with the given 'FileFormat',
-- batch the parsed documents by 500, and run 'flowCorpus'.
-- NOTE(review): garbled fragment — parts of the signature and of the
-- parsing pipeline (between `splitEvery 500` and the final line) are
-- missing; recover from VCS.
165 flowCorpusFile :: FlowCmdM env err m
167 -> Either CorpusName [CorpusId]
168 -> Limit -- Limit the number of docs (for dev purpose)
169 -> TermType Lang -> FileFormat -> FilePath
171 flowCorpusFile u n l la ff fp = do
172 docs <- liftBase ( splitEvery 500
-- Each parsed record is converted to a HyperdataDocument before flowing.
176 flowCorpus u n la (map (map toHyperdataDocument) docs)
178 ------------------------------------------------------------------------
179 -- | TODO improve the needed type to create/update a corpus
180 -- (For now, Either is enough)
-- 'flow' specialised to corpora: the corpus hyperdata is left unspecified.
-- NOTE(review): signature lines missing in this garbled fragment.
181 flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
183 -> Either CorpusName [CorpusId]
187 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
-- | Generic entry point: insert every batch of documents into the master
-- corpus, then build the user-side corpus structures from the resulting ids.
-- NOTE(review): signature lines missing in this garbled fragment.
190 flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
193 -> Either CorpusName [CorpusId]
197 flow c u cn la docs = do
-- One insertMasterDocs call per batch; ids of all batches are concatenated.
198 ids <- traverse (insertMasterDocs c la) docs
199 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
201 ------------------------------------------------------------------------
-- | Build the user-side corpus around already-inserted document ids:
-- user root/corpus/list nodes, ngrams lists (shared with the master
-- corpus), plus dashboard/graph/texts child nodes.
-- NOTE(review): garbled fragment — signature lines and the final return
-- (presumably the user corpus id) are missing; recover from VCS.
202 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
205 -> Either CorpusName [CorpusId]
209 flowCorpusUser l user corpusName ctype ids = do
-- User-side nodes: root, corpus, default list, cooccurrence node.
211 (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
212 listId <- getOrMkList userCorpusId userId
213 _cooc <- mkNode NodeListCooc listId userId
214 -- TODO: check if present already, ignore
-- Link the already-inserted documents to the user corpus.
215 _ <- Doc.add userCorpusId ids
217 _tId <- mkNode NodeTexts userCorpusId userId
218 -- printDebug "Node Text Id" tId
-- Master corpus supplies global statistics for list building.
221 (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
-- Magic numbers 2 / 3 / StopSize 3 are list-building thresholds — meaning
-- defined in buildNgramsLists; confirm there before changing.
222 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
223 _userListId <- flowList_DbRepo listId ngs
224 _mastListId <- getOrMkList masterCorpusId masterUserId
225 -- _ <- insertOccsUpdates userCorpusId mastListId
226 -- printDebug "userListId" userListId
-- Auxiliary visualisation nodes under the user corpus.
228 _ <- mkDashboard userCorpusId userId
229 _ <- mkGraph userCorpusId userId
230 --_ <- mkPhylo userCorpusId userId
233 -- _ <- mkAnnuaire rootUserId userId
-- | Insert documents into the master corpus and index their ngrams:
-- deduplicate via hash ids, insert rows, extract ngrams per document,
-- then record ngrams / list / node-node-ngrams links.
-- NOTE(review): garbled fragment — constraint/signature lines, several
-- body lines (e.g. the binding of `ids'` used below, the end of the
-- listInsertDb argument) and the final return are missing; recover from
-- VCS before editing.
237 insertMasterDocs :: ( FlowCmdM env err m
245 insertMasterDocs c lang hs = do
246 (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c
248 -- TODO Type NodeDocumentUnicised
-- Attach a unique hash id to each document for deduplication.
249 let docs = map addUniqId hs
250 ids <- insertDb masterUserId masterCorpusId docs
-- Pair each inserted row (ReturnId) back with its source document by hash.
253 documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
255 -- create a corpus with database name (CSV or PubMed)
256 -- add documents to the corpus (create node_node link)
257 -- this will enable global database monitoring
259 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
-- Extract ngrams per document; language may be refined from the texts
-- themselves by withLang (unsupervised case).
260 maps <- mapNodeIdNgrams
261 <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
-- Persist the ngrams terms and re-key the map by their database ids.
263 terms2id <- insertNgrams $ Map.keys maps
265 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
268 lId <- getOrMkList masterCorpusId masterUserId
269 mapCgramsId <- listInsertDb lId toNodeNgramsW'
270 $ map (first _ngramsTerms . second Map.keys)
-- One NodeNodeNgrams2 row per (document, ngrams type, term) with its
-- weight; rows whose cgrams id cannot be resolved are dropped (catMaybes).
273 _return <- insertNodeNodeNgrams2
274 $ catMaybes [ NodeNodeNgrams2 <$> Just nId
275 <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
276 <*> Just (fromIntegral w :: Double)
277 | (terms, mapNgramsTypes) <- Map.toList maps
278 , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
279 , (nId, w) <- Map.toList mapNodeIdWeight
-- `ids'` is bound on a missing line — presumably the inserted node ids.
282 _ <- Doc.add masterCorpusId ids'
283 _cooc <- mkNode NodeListCooc lId masterUserId
285 _ <- insertDocNgrams lId indexedNgrams
-- | Refine a TermType: in the 'Unsupervised' case, when no model is given,
-- build tries from the documents' own text (toToken/uniText over the
-- concatenated texts).  NOTE(review): garbled fragment — signature lines,
-- the `where`/`m'` binding, the Just branch and the non-Unsupervised
-- equation are missing; recover from VCS.
290 withLang :: HasText a
292 -> [DocumentWithId a]
294 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
297 Nothing -> trace ("buildTries here" :: String)
299 $ buildTries n ( fmap toToken $ uniText
300 $ Text.intercalate " . "
308 ------------------------------------------------------------------------
-- | Pair a document with its unique hash id; panics if the hash is absent
-- (documents are expected to have passed addUniqId first).
-- NOTE(review): signature lines and the `where` keyword are missing.
309 viewUniqId' :: UniqId a
312 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
-- NOTE(review): the panic message says "toInsert" although this function
-- is viewUniqId' — likely copy-pasted; worth fixing once the full file is
-- available.
314 err = panic "[ERROR] Database.Flow.toInsert"
-- | Index the rows that were actually inserted (reInserted) by their
-- unique hash id.  NOTE(review): the equation's left-hand side line is
-- missing from this garbled fragment; `reInserted r == True` could be
-- simplified to `reInserted r` once the file is editable.
317 toInserted :: [ReturnId]
318 -> Map HashId ReturnId
320 Map.fromList . map (\r -> (reUniqId r, r) )
321 . filter (\r -> reInserted r == True)
-- | Join inserted rows with their source documents by hash id, keeping
-- only documents whose insertion succeeded (catMaybes drops misses).
-- NOTE(review): garbled fragment — the second argument's type line, the
-- `where` keyword and the end of toDocumentWithId are missing.
323 mergeData :: Map HashId ReturnId
325 -> [DocumentWithId a]
326 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
328 toDocumentWithId (sha,hpd) =
-- reId gives the database node id assigned at insertion time.
329 DocumentWithId <$> fmap reId (lookup sha rs)
332 ------------------------------------------------------------------------
-- HasText instance body is on missing lines (garbled fragment).
334 instance HasText HyperdataContact
-- Ngrams extraction for contacts: only the last name is indexed, as an
-- "Authors" ngram; terms longer than 255 chars are truncated by
-- filterNgramsT.  NOTE(review): lines are missing (e.g. the binding of
-- `hc'` used below) — recover from VCS before editing.
338 instance ExtractNgramsT HyperdataContact
340 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
342 extract :: TermType Lang -> HyperdataContact
343 -> Cmd err (Map Ngrams (Map NgramsType Int))
-- "Nothing" (the literal string) is used as a placeholder author when the
-- last name is absent.
345 let authors = map text2ngrams
346 $ maybe ["Nothing"] (\a -> [a])
347 $ view (hc_who . _Just . cw_lastName) hc'
349 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
-- | The indexable text of a document is its title and abstract, whichever
-- are present.  NOTE(review): the closing of the catMaybes list is on a
-- missing line in this garbled fragment.
351 instance HasText HyperdataDocument
353 hasText h = catMaybes [ _hyperdataDocument_title h
354 , _hyperdataDocument_abstract h
-- | Ngrams extraction for documents: source, institutes, authors and
-- terms from the text, each tagged with its NgramsType and weight 1.
-- Terms longer than 255 chars are truncated by filterNgramsT.
-- NOTE(review): garbled fragment — signature lines and the `where` for
-- extractNgramsT' are on missing lines; recover from VCS before editing.
357 instance ExtractNgramsT HyperdataDocument
359 extractNgramsT :: TermType Lang
361 -> Cmd err (Map Ngrams (Map NgramsType Int))
362 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
364 extractNgramsT' :: TermType Lang
366 -> Cmd err (Map Ngrams (Map NgramsType Int))
367 extractNgramsT' lang' doc = do
-- The literal string "Nothing" is the placeholder for missing fields.
368 let source = text2ngrams
369 $ maybe "Nothing" identity
370 $ _hyperdataDocument_source doc
-- Institutes are comma-separated; each is normalised via toSchoolName.
372 institutes = map text2ngrams
373 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
374 $ _hyperdataDocument_institutes doc
376 authors = map text2ngrams
377 $ maybe ["Nothing"] (splitOn ", ")
378 $ _hyperdataDocument_authors doc
-- Terms are extracted from the document text; multi-word labels are
-- re-joined with spaces before being turned into ngrams.
380 terms' <- map text2ngrams
381 <$> map (intercalate " " . _terms_label)
383 <$> liftBase (extractTerms lang' $ hasText doc)
385 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
386 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
387 <> [(a', Map.singleton Authors 1) | a' <- authors ]
388 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
-- | Truncate over-long ngram terms to at most `s` characters, keeping the
-- associated type/count map unchanged.  NOTE(review): garbled fragment —
-- the `where` keyword and the True branch of the case (presumably
-- `(ng, y)` unchanged) are on missing lines.
390 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
391 -> Map Ngrams (Map NgramsType Int)
392 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
394 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
396 False -> (Ngrams (Text.take s' t) n , y)
-- | Run the ngrams-extraction action over each document, pairing every
-- document with its extracted ngrams map.  NOTE(review): a signature line
-- and the `where` keyword are missing from this garbled fragment.
399 documentIdWithNgrams :: HasNodeError err
401 -> Cmd err (Map Ngrams (Map NgramsType Int)))
402 -> [DocumentWithId a]
403 -> Cmd err [DocumentIdWithNgrams a]
404 documentIdWithNgrams f = traverse toDocumentIdWithNgrams
406 toDocumentIdWithNgrams d = do
-- f extracts the ngrams from the document's payload (documentData).
407 e <- f $ documentData d
408 pure $ DocumentIdWithNgrams d e