2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 -- check userId CanFillUserCorpus userCorpusId
13 -- check masterUserId CanFillMasterCorpus masterCorpusId
15 -- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
16 -- TODO-EVENTS: InsertedNodes
21 {-# LANGUAGE ConstraintKinds #-}
22 {-# LANGUAGE DeriveGeneric #-}
23 {-# LANGUAGE FlexibleContexts #-}
24 {-# LANGUAGE InstanceSigs #-}
25 {-# LANGUAGE NoImplicitPrelude #-}
26 {-# LANGUAGE OverloadedStrings #-}
27 {-# LANGUAGE RankNTypes #-}
28 {-# LANGUAGE ConstrainedClassMethods #-}
30 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
33 --import Debug.Trace (trace)
34 import Control.Lens ((^.), view, Lens', _Just)
35 import Control.Monad (mapM_)
36 import Control.Monad.IO.Class (liftIO)
37 import Data.List (concat)
38 import Data.Map (Map, lookup, toList)
39 import Data.Maybe (Maybe(..), catMaybes)
41 import Data.Text (Text, splitOn, intercalate)
42 import GHC.Show (Show)
43 import Gargantext.API.Ngrams (HasRepoVar)
44 import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
45 import Gargantext.Core (Lang(..))
46 import Gargantext.Core.Types (NodePoly(..), Terms(..))
47 import Gargantext.Core.Types.Individu (Username)
48 import Gargantext.Core.Types.Main
49 import Gargantext.Database.Config (userMaster, corpusMasterName)
50 import Gargantext.Database.Flow.Utils (insertDocNgrams)
51 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
52 import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
53 import Gargantext.Database.Root (getRoot)
54 import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
55 import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
56 import Gargantext.Database.Schema.User (getUser, UserLight(..))
57 import Gargantext.Database.TextSearch (searchInDatabase)
58 import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
59 import Gargantext.Database.Utils (Cmd, CmdM)
60 import Gargantext.Ext.IMT (toSchoolName)
61 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
62 import Gargantext.Prelude
63 import Gargantext.Text.Terms.Eleve (buildTries, toToken)
64 import Gargantext.Text.List (buildNgramsLists,StopSize(..))
65 import Gargantext.Text.Parsers (parseFile, FileFormat)
66 import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
67 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
68 import Servant (ServantErr)
69 import System.FilePath (FilePath)
70 import qualified Data.List as List
71 import qualified Data.Map as Map
72 import qualified Data.Text as Text
73 import qualified Gargantext.Database.Node.Document.Add as Doc (add)
74 import qualified Gargantext.Text.Parsers.GrandDebat as GD
76 type FlowCmdM env err m =
83 type FlowCorpus a = ( AddUniqId a
90 ------------------------------------------------------------------------
92 flowAnnuaire :: FlowCmdM env ServantErr m
93 => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
94 flowAnnuaire u n l filePath = do
95 docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
96 flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
99 flowCorpusDebat :: FlowCmdM env ServantErr m
100 => Username -> CorpusName
103 flowCorpusDebat u n l fp = do
104 docs <- liftIO ( splitEvery 500
107 :: IO [[GD.GrandDebatReference ]]
109 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
112 flowCorpusFile :: FlowCmdM env ServantErr m
113 => Username -> CorpusName
114 -> Limit -- Limit the number of docs (for dev purpose)
115 -> TermType Lang -> FileFormat -> FilePath
117 flowCorpusFile u n l la ff fp = do
118 docs <- liftIO ( splitEvery 500
122 flowCorpus u n la (map (map toHyperdataDocument) docs)
124 -- TODO query with complex query
125 flowCorpusSearchInDatabase :: FlowCmdM env err m
126 => Username -> Lang -> Text -> m CorpusId
127 flowCorpusSearchInDatabase u la q = do
128 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
129 ids <- map fst <$> searchInDatabase cId (stemIt q)
130 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
133 flowCorpusSearchInDatabase' :: FlowCmdM env ServantErr m
134 => Username -> Lang -> Text -> m CorpusId
135 flowCorpusSearchInDatabase' u la q = do
136 (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
137 ids <- map fst <$> searchInDatabase cId (stemIt q)
138 flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
140 ------------------------------------------------------------------------
142 flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
143 => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
144 flow c u cn la docs = do
145 ids <- mapM (insertMasterDocs c la ) docs
146 flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
148 flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
149 => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
150 flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
153 flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
154 => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
155 flowCorpusUser l userName corpusName ctype ids = do
157 (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
158 -- TODO: check if present already, ignore
159 _ <- Doc.add userCorpusId ids
163 (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
164 ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
165 userListId <- flowList userId userCorpusId ngs
166 printDebug "userListId" userListId
168 _ <- mkGraph userCorpusId userId
171 -- User Dashboard Flow
172 _ <- mkDashboard userCorpusId userId
175 -- _ <- mkAnnuaire rootUserId userId
179 insertMasterDocs :: ( FlowCmdM env ServantErr m
183 => Maybe c -> TermType Lang -> [a] -> m [DocId]
184 insertMasterDocs c lang hs = do
185 (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
187 -- TODO Type NodeDocumentUnicised
188 let hs' = map addUniqId hs
189 ids <- insertDb masterUserId masterCorpusId hs'
190 let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
193 fixLang (Unsupervised l n s m) = Unsupervised l n s m'
196 Nothing -> Just $ buildTries n (fmap toToken $ uniText $ Text.intercalate " " $ List.concat $ map hasText documentsWithId)
201 -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
202 maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
203 terms2id <- insertNgrams $ Map.keys maps
204 let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
206 lId <- getOrMkList masterCorpusId masterUserId
207 _ <- insertDocNgrams lId indexedNgrams
212 type CorpusName = Text
214 getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
215 => Username -> CorpusName -> Maybe a
216 -> Cmd err (UserId, RootId, CorpusId)
217 getOrMkRootWithCorpus username cName c = do
218 maybeUserId <- getUser username
219 userId <- case maybeUserId of
220 Nothing -> nodeError NoUserFound
221 Just user -> pure $ userLight_id user
223 rootId' <- map _node_id <$> getRoot username
225 rootId'' <- case rootId' of
226 [] -> mkRoot username userId
227 n -> case length n >= 2 of
228 True -> nodeError ManyNodeUsers
229 False -> pure rootId'
231 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
233 corpusId'' <- if username == userMaster
235 ns <- getCorporaWithParentId rootId
236 pure $ map _node_id ns
240 corpusId' <- if corpusId'' /= []
242 else mk (Just cName) c rootId userId
244 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
246 pure (userId, rootId, corpusId)
249 ------------------------------------------------------------------------
254 uniqId :: Lens' a (Maybe HashId)
257 instance UniqId HyperdataDocument
259 uniqId = hyperdataDocument_uniqId
261 instance UniqId HyperdataContact
265 viewUniqId' :: UniqId a => a -> (HashId, a)
266 viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
268 err = panic "[ERROR] Database.Flow.toInsert"
271 toInserted :: [ReturnId] -> Map HashId ReturnId
272 toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
273 . filter (\r -> reInserted r == True)
275 data DocumentWithId a = DocumentWithId
276 { documentId :: !NodeId
280 instance HasText a => HasText (DocumentWithId a)
282 hasText (DocumentWithId _ a) = hasText a
284 mergeData :: Map HashId ReturnId
286 -> [DocumentWithId a]
287 mergeData rs = catMaybes . map toDocumentWithId . Map.toList
289 toDocumentWithId (hash,hpd) =
290 DocumentWithId <$> fmap reId (lookup hash rs)
293 ------------------------------------------------------------------------
294 data DocumentIdWithNgrams a = DocumentIdWithNgrams
295 { documentWithId :: !(DocumentWithId a)
296 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
300 class ExtractNgramsT h
302 extractNgramsT :: HasText h => TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
306 hasText :: h -> [Text]
308 instance HasText HyperdataContact
312 instance ExtractNgramsT HyperdataContact
314 extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
316 extract :: TermType Lang -> HyperdataContact
317 -> Cmd err (Map Ngrams (Map NgramsType Int))
319 let authors = map text2ngrams
320 $ maybe ["Nothing"] (\a -> [a])
321 $ view (hc_who . _Just . cw_lastName) hc'
323 pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
325 instance HasText HyperdataDocument
327 hasText h = catMaybes [ _hyperdataDocument_title h
328 , _hyperdataDocument_abstract h
331 instance ExtractNgramsT HyperdataDocument
333 extractNgramsT :: TermType Lang -> HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
334 extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
336 extractNgramsT' :: TermType Lang -> HyperdataDocument
337 -> Cmd err (Map Ngrams (Map NgramsType Int))
338 extractNgramsT' lang' doc = do
339 let source = text2ngrams
340 $ maybe "Nothing" identity
341 $ _hyperdataDocument_source doc
343 institutes = map text2ngrams
344 $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
345 $ _hyperdataDocument_institutes doc
347 authors = map text2ngrams
348 $ maybe ["Nothing"] (splitOn ", ")
349 $ _hyperdataDocument_authors doc
351 terms' <- map text2ngrams
352 <$> map (intercalate " " . _terms_label)
354 <$> liftIO (extractTerms lang' $ hasText doc)
356 pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
357 <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
358 <> [(a', Map.singleton Authors 1) | a' <- authors ]
359 <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
362 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
363 -> Map Ngrams (Map NgramsType Int)
364 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
366 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
368 False -> (Ngrams (Text.take s' t) n , y)
371 documentIdWithNgrams :: HasNodeError err
373 -> Cmd err (Map Ngrams (Map NgramsType Int)))
374 -> [DocumentWithId a]
375 -> Cmd err [DocumentIdWithNgrams a]
376 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
378 toDocumentIdWithNgrams d = do
379 e <- f $ documentData d
380 pure $ DocumentIdWithNgrams d e
384 -- | TODO check optimization
385 mapNodeIdNgrams :: [DocumentIdWithNgrams a]
386 -> Map Ngrams (Map NgramsType (Map NodeId Int))
387 mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
389 f :: DocumentIdWithNgrams a
390 -> Map Ngrams (Map NgramsType (Map NodeId Int))
391 f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
393 nId = documentId $ documentWithId d
395 ------------------------------------------------------------------------
396 listInsert :: FlowCmdM env err m
397 => ListId -> Map NgramsType [NgramsElement]
399 listInsert lId ngs = mapM_ (\(typeList, ngElmts)
400 -> putListNgrams lId typeList ngElmts
403 flowList :: FlowCmdM env err m => UserId -> CorpusId
404 -> Map NgramsType [NgramsElement]
406 flowList uId cId ngs = do
407 lId <- getOrMkList cId uId
408 printDebug "listId flowList" lId
410 --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs