Module      : Gargantext.Database.Flow
Description : Database Flow
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

-- check userId       CanFillUserCorpus   userCorpusId
-- check masterUserId CanFillMasterCorpus masterCorpusId

-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
-- TODO-EVENTS: InsertedNodes
{-# OPTIONS_GHC -fno-warn-orphans #-}

{-# LANGUAGE ConstraintKinds         #-}
{-# LANGUAGE RankNTypes              #-}
{-# LANGUAGE ConstrainedClassMethods #-}
{-# LANGUAGE DeriveGeneric           #-}
{-# LANGUAGE FlexibleContexts        #-}
{-# LANGUAGE InstanceSigs            #-}
{-# LANGUAGE NoImplicitPrelude       #-}
{-# LANGUAGE OverloadedStrings       #-}
{-# LANGUAGE TemplateHaskell         #-}
module Gargantext.Database.Action.Flow -- (flowDatabase, ngrams2list)
    , getOrMk_RootWithCorpus
import Control.Lens ((^.), view, _Just, makeLenses)
import Data.Aeson.TH (deriveJSON)
import Data.List (concat)
import Data.Map (Map, lookup)
import Data.Maybe (Maybe(..), catMaybes)
import Data.Text (Text, splitOn, intercalate)
import Data.Traversable (traverse)
import Data.Tuple.Extra (first, second)
import Debug.Trace (trace)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Flow.Types
import Gargantext.Core.Types (Terms(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Types.Main
import Gargantext.Database.Action.Flow.List
import Gargantext.Database.Action.Flow.Types
import Gargantext.Database.Action.Flow.Utils (insertDocNgrams)
import Gargantext.Database.Action.Query.Node
import Gargantext.Database.Action.Query.Node.Contact -- (HyperdataContact(..), ContactWho(..))
import Gargantext.Database.Action.Query.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Action.Query.Tree.Root (getOrMkRoot, getOrMk_RootWithCorpus)
import Gargantext.Database.Action.Search (searchInDatabase)
import Gargantext.Database.Admin.Config (userMaster, corpusMasterName)
import Gargantext.Database.Admin.Types.Errors (HasNodeError(..))
import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
import Gargantext.Database.Admin.Utils (Cmd)
import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
import Gargantext.Database.Schema.NodeNgrams (listInsertDb, getCgramsId)
import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude
import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
import Gargantext.Text.List (buildNgramsLists, StopSize(..))
import qualified Gargantext.Text.Terms as GTT (TermType(..), tt_lang, extractTerms, uniText)
import Gargantext.Text.Terms.Eleve (buildTries, toToken)
import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
import GHC.Generics (Generic)
import Prelude (String)
import System.FilePath (FilePath)
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Text as Text
import qualified Gargantext.Database.Action.Query.Node.Document.Add as Doc (add)
import qualified Gargantext.Text.Corpus.API as API
------------------------------------------------------------------------
-- TODO use internal with API name (could be old data)
data DataOrigin = Internal Gargantext
                | External API.ExternalAPIs

data DataText = DataOld ![NodeId]
              | DataNew ![[HyperdataDocument]]
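-- A rough sketch of the two cases (the node ids and documents below are
-- hypothetical values, only meant to illustrate the shapes):
--
--   DataOld [NodeId 1, NodeId 2]     -- documents already in the database, by id
--   DataNew [[doc1, doc2], [doc3]]   -- freshly fetched documents, in batches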
-- TODO use the split parameter in config file
getDataText :: FlowCmdM env err m
getDataText (External api) la q li = liftBase $ DataNew
                                  <$> API.get api (_tt_lang la) q li
getDataText (Internal _) la q li = do
  (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
                                           (UserName userMaster)
                                           (Nothing :: Maybe HyperdataCorpus)
  ids <- map fst <$> searchInDatabase cId (stemIt q)
-------------------------------------------------------------------------------
data TermType lang
  = Mono      { _tt_lang :: lang }
  | Multi     { _tt_lang :: lang }
  | MonoMulti { _tt_lang :: lang }
  | Unsupervised { _tt_lang       :: lang
                 , _tt_windowSize :: Int
                 , _tt_ngramsSize :: Int
                 }
  deriving (Generic)
-- | Local counterpart of GTT.TermType: it leaves out the Unsupervised model,
-- which is not needed here; tta2tt restores it as Nothing.
tta2tt :: TermType lang -> GTT.TermType lang
tta2tt (Mono l)               = GTT.Mono l
tta2tt (Multi l)              = GTT.Multi l
tta2tt (MonoMulti l)          = GTT.MonoMulti l
tta2tt (Unsupervised la w ng) = GTT.Unsupervised la w ng Nothing
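-- For instance (sketch, assuming Eq instances): an unsupervised configuration
-- simply gains an empty model on conversion, since the local type carries none:
--
--   tta2tt (Unsupervised EN 10 3) == GTT.Unsupervised EN 10 3 Nothing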
makeLenses ''TermType
deriveJSON (unPrefix "_tt_") ''TermType

instance (ToSchema a) => ToSchema (TermType a) where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_tt_")
flowDataText :: FlowCmdM env err m
flowDataText u (DataOld ids) tt cid = flowCorpusUser (_tt_lang tt) u (Right [cid]) corpusType ids
  where
    corpusType = (Nothing :: Maybe HyperdataCorpus)
flowDataText u (DataNew txt) tt cid = flowCorpus u (Right [cid]) tt txt
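-- Typical wiring (sketch; the argument order follows the equations above, the
-- user name, query and corpusId are hypothetical, and API.PubMed stands for
-- whichever constructor of API.ExternalAPIs is wanted):
--
--   txt <- getDataText (External API.PubMed) (Mono EN) "tuberculosis" (Just 100)
--   _   <- flowDataText (UserName "alice") txt (Mono EN) corpusId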
------------------------------------------------------------------------
flowAnnuaire :: FlowCmdM env err m
             -> Either CorpusName [CorpusId]
flowAnnuaire u n l filePath = do
  docs <- liftBase ((splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
  flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
------------------------------------------------------------------------
flowCorpusFile :: FlowCmdM env err m
               -> Either CorpusName [CorpusId]
               -> Limit -- Limit the number of docs (for dev purpose)
               -> TermType Lang -> FileFormat -> FilePath
flowCorpusFile u n l la ff fp = do
  docs <- liftBase ( splitEvery 500
                 <$> take l
                 <$> parseFile ff fp
                   )
  flowCorpus u n la (map (map toHyperdataDocument) docs)
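-- Example call (sketch; the user, corpus name, limit and path are hypothetical,
-- and CsvGargV3 is assumed to be one of the FileFormat constructors exported by
-- Gargantext.Text.Corpus.Parsers):
--
--   flowCorpusFile (UserName "alice") (Left "wheat corpus") 1000 (Multi EN) CsvGargV3 "/tmp/wheat.csv"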
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
-- (For now, Either is enough)
flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
           -> Either CorpusName [CorpusId]
flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
     -> Either CorpusName [CorpusId]
flow c u cn la docs = do
  let la' = tta2tt la
  ids <- traverse (insertMasterDocs c la') docs
  flowCorpusUser (la' ^. GTT.tt_lang) u cn c (concat ids)
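-- Note: docs is a list of batches (e.g. the 500-document chunks produced by
-- splitEvery upstream); each batch is inserted into the master corpus on its
-- own, and the resulting ids are concatenated before flowCorpusUser builds the
-- user-side corpus.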
------------------------------------------------------------------------
flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
               -> Either CorpusName [CorpusId]
flowCorpusUser l user corpusName ctype ids = do
  (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus user corpusName ctype
  listId <- getOrMkList userCorpusId userId
  _cooc  <- mkNode NodeListCooc listId userId
  -- TODO: check if present already, ignore
  _ <- Doc.add userCorpusId ids

  _tId <- mkNode NodeTexts userCorpusId userId
  -- printDebug "Node Text Id" tId

  (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left "") ctype
  ngs         <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
  _userListId <- flowList_DbRepo listId ngs
  _mastListId <- getOrMkList masterCorpusId masterUserId
  -- _ <- insertOccsUpdates userCorpusId mastListId
  -- printDebug "userListId" userListId

  _ <- mkDashboard userCorpusId userId
  _ <- mkGraph     userCorpusId userId
  --_ <- mkPhylo   userCorpusId userId
  -- _ <- mkAnnuaire rootUserId userId

  pure userCorpusId
insertMasterDocs :: ( FlowCmdM env err m
insertMasterDocs c lang hs = do
  (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus (UserName userMaster) (Left corpusMasterName) c

  -- TODO Type NodeDocumentUnicised
  let docs = map addUniqId hs
  ids <- insertDb masterUserId masterCorpusId docs
  let
    ids' = map reId ids
    documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
  -- create a corpus with database name (CSV or PubMed)
  -- add documents to the corpus (create node_node link)
  -- this will enable global database monitoring

  -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
  maps <- mapNodeIdNgrams
       <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId

  terms2id <- insertNgrams $ Map.keys maps
  let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps

  lId <- getOrMkList masterCorpusId masterUserId
  mapCgramsId <- listInsertDb lId toNodeNgramsW'
               $ map (first _ngramsTerms . second Map.keys)
               $ Map.toList maps
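  -- Flatten the nested map (ngram -> ngrams type -> node id -> count) into one
  -- NodeNodeNgrams2 row per (document, ngram, type), with the count as a
  -- Double weight; rows whose list-ngrams id cannot be resolved by getCgramsId
  -- are dropped by catMaybes.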
  _return <- insertNodeNodeNgrams2
           $ catMaybes [ NodeNodeNgrams2 <$> Just nId
                                         <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
                                         <*> Just (fromIntegral w :: Double)
                       | (terms, mapNgramsTypes)        <- Map.toList maps
                       , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
                       , (nId, w)                       <- Map.toList mapNodeIdWeight
                       ]
  _ <- Doc.add masterCorpusId ids'
  _cooc <- mkNode NodeListCooc lId masterUserId
  _ <- insertDocNgrams lId indexedNgrams

  pure ids'
withLang :: HasText a
         -> [DocumentWithId a]
withLang (GTT.Unsupervised l n s m) ns = GTT.Unsupervised l n s m'
  where
    m' = case m of
      Nothing -> trace ("buildTries here" :: String)
               $ Just
               $ buildTries n ( fmap toToken $ GTT.uniText
                              $ Text.intercalate " . "
------------------------------------------------------------------------
viewUniqId' :: UniqId a
viewUniqId' d = maybe err (\h -> (h, d)) (view uniqId d)
  where
    err = panic "[ERROR] Database.Flow.toInsert"
toInserted :: [ReturnId]
           -> Map HashId ReturnId
toInserted =
  Map.fromList . map (\r -> (reUniqId r, r))
               . filter reInserted
mergeData :: Map HashId ReturnId
          -> [DocumentWithId a]
mergeData rs = catMaybes . map toDocumentWithId . Map.toList
  where
    toDocumentWithId (sha, hpd) =
      DocumentWithId <$> fmap reId (lookup sha rs)
                     <*> Just hpd
------------------------------------------------------------------------
instance HasText HyperdataContact

instance ExtractNgramsT HyperdataContact
  where
    extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
      where
        extract :: GTT.TermType Lang -> HyperdataContact
                -> Cmd err (Map Ngrams (Map NgramsType Int))
        extract _l hc' = do
          let authors = map text2ngrams
                      $ maybe ["Nothing"] (\a -> [a])
                      $ view (hc_who . _Just . cw_lastName) hc'

          pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
instance HasText HyperdataDocument
  where
    hasText h = catMaybes [ _hyperdataDocument_title    h
                          , _hyperdataDocument_abstract h
                          ]
instance ExtractNgramsT HyperdataDocument
  where
    extractNgramsT :: GTT.TermType Lang
                   -> Cmd err (Map Ngrams (Map NgramsType Int))
    extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
      where
        extractNgramsT' :: GTT.TermType Lang
                        -> Cmd err (Map Ngrams (Map NgramsType Int))
        extractNgramsT' lang' doc = do
          let source = text2ngrams
                     $ maybe "Nothing" identity
                     $ _hyperdataDocument_source doc

              institutes = map text2ngrams
                         $ maybe ["Nothing"] (map toSchoolName . splitOn ", ")
                         $ _hyperdataDocument_institutes doc

              authors = map text2ngrams
                      $ maybe ["Nothing"] (splitOn ", ")
                      $ _hyperdataDocument_authors doc
          terms' <- map text2ngrams
                <$> map (intercalate " " . _terms_label)
                <$> concat
                <$> liftBase (GTT.extractTerms lang' $ hasText doc)
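          -- Build the final map: the document's source tagged as Sources, its
          -- institutes as Institutes, its authors as Authors, and each
          -- extracted term as NgramsTerms, all with a count of 1 for this
          -- single document.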
          pure $ Map.fromList $  [(source, Map.singleton Sources     1)]
                              <> [(i', Map.singleton Institutes  1) | i' <- institutes ]
                              <> [(a', Map.singleton Authors     1) | a' <- authors    ]
                              <> [(t', Map.singleton NgramsTerms 1) | t' <- terms'     ]
filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
              -> Map Ngrams (Map NgramsType Int)
filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
  where
    filter' s' (ng@(Ngrams t n), y) = case Text.length t < s' of
      True  -> (ng, y)
      False -> (Ngrams (Text.take s' t) n, y)
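-- e.g. (sketch): with s' = 5, an Ngrams whose text is "photosynthesis" is
-- truncated to "photo" (keeping its size field n), while ngrams shorter than
-- 5 characters pass through unchanged.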
documentIdWithNgrams :: HasNodeError err
                     -> Cmd err (Map Ngrams (Map NgramsType Int)))
                     -> [DocumentWithId a]
                     -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = traverse toDocumentIdWithNgrams
  where
    toDocumentIdWithNgrams d = do
      e <- f $ documentData d
      pure $ DocumentIdWithNgrams d e