Stability : experimental
Portability : POSIX
+-- TODO-ACCESS:
+-- check userId CanFillUserCorpus userCorpusId
+-- check masterUserId CanFillMasterCorpus masterCorpusId
+
+-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
+-- TODO-EVENTS: InsertedNodes
+-}
+
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+{-# LANGUAGE ConstraintKinds #-}
+{-# LANGUAGE RankNTypes #-}
+{-# LANGUAGE ConstrainedClassMethods #-}
+{-# LANGUAGE ConstraintKinds #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE InstanceSigs #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
+
+module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
+ ( FlowCmdM
+ , flowCorpusFile
+ , flowCorpus
+ , flowCorpusSearchInDatabase
+ , getOrMkRoot
+ , getOrMk_RootWithCorpus
+ , flowAnnuaire
+ )
+ where
+
+import Prelude (String)
+import Data.Either
+import Data.Tuple.Extra (first, second)
+import Data.Traversable (traverse)
+import Debug.Trace (trace)
+import Control.Lens ((^.), view, _Just)
+import Control.Monad.IO.Class (liftIO)
+import Data.List (concat)
+import Data.Map (Map, lookup)
+import Data.Maybe (Maybe(..), catMaybes)
+import Data.Monoid
+import Data.Text (Text, splitOn, intercalate)
+import Gargantext.Core (Lang(..))
+import Gargantext.Core.Types (NodePoly(..), Terms(..))
+import Gargantext.Core.Types.Individu (Username)
+import Gargantext.Core.Flow.Types
+import Gargantext.Core.Types.Main
+import Gargantext.Database.Config (userMaster, corpusMasterName)
+import Gargantext.Database.Flow.Utils (insertDocNgrams)
+import Gargantext.Database.Flow.List
+import Gargantext.Database.Flow.Types
+import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
+import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
+import Gargantext.Database.Root (getRoot)
+
+import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
+import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
+import Gargantext.Database.Schema.NodeNgrams (listInsertDb , getCgramsId)
+import Gargantext.Database.Schema.NodeNodeNgrams2 -- (NodeNodeNgrams2, insertNodeNodeNgrams2)
+import Gargantext.Database.Schema.User (getUser, UserLight(..))
+import Gargantext.Database.TextSearch (searchInDatabase)
+import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
+import Gargantext.Database.Utils (Cmd)
+import Gargantext.Ext.IMT (toSchoolName)
+import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
+import Gargantext.Prelude
+import Gargantext.Text.Terms.Eleve (buildTries, toToken)
+import Gargantext.Text.List (buildNgramsLists,StopSize(..))
+import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
+import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
+import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
+import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
+import Gargantext.Prelude.Utils hiding (sha)
+import System.FilePath (FilePath)
+import qualified Data.List as List
+import qualified Data.Map as Map
+import qualified Data.Text as Text
+import qualified Gargantext.Database.Node.Document.Add as Doc (add)
+import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
+
+------------------------------------------------------------------------
+
+data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
+-- | APIs
+-- TODO instances
+getDataApi :: Lang
+ -> Maybe Limit
+ -> ApiQuery
+ -> IO [HyperdataDocument]
+getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
+getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
-add :: Corpus -> [Documents] -> IO Int
-if new id -> extractNgrams + extract Authors + extract Sources
-Map (Ngrams, NodeId)
-insert Ngrams -> NgramsId
-Map (NgramsId, NodeId) -> insert
+-- UNUSED
+_flowCorpusApi :: ( FlowCmdM env err m)
+ => Username -> Either CorpusName [CorpusId]
+ -> TermType Lang
+ -> Maybe Limit
+ -> ApiQuery
+ -> m CorpusId
+_flowCorpusApi u n tt l q = do
+ docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
+ flowCorpus u n tt docs
-data NgramsType = Sources | Authors | Terms
+------------------------------------------------------------------------
-nodes_ngrams : column type, column list
+flowAnnuaire :: FlowCmdM env err m
+ => Username
+ -> Either CorpusName [CorpusId]
+ -> (TermType Lang)
+ -> FilePath
+ -> m AnnuaireId
+flowAnnuaire u n l filePath = do
+ docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
+ flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
+-- UNUSED
+_flowCorpusDebat :: FlowCmdM env err m
+ => Username -> Either CorpusName [CorpusId]
+ -> Limit -> FilePath
+ -> m CorpusId
+_flowCorpusDebat u n l fp = do
+ docs <- liftIO ( splitEvery 500
+ <$> take l
+ <$> readFile' fp
+ :: IO [[GD.GrandDebatReference ]]
+ )
+ flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
-documents
-sources
-authors
+flowCorpusFile :: FlowCmdM env err m
+ => Username -> Either CorpusName [CorpusId]
+ -> Limit -- Limit the number of docs (for dev purpose)
+ -> TermType Lang -> FileFormat -> FilePath
+ -> m CorpusId
+flowCorpusFile u n l la ff fp = do
+ docs <- liftIO ( splitEvery 500
+ <$> take l
+ <$> parseFile ff fp
+ )
+ flowCorpus u n la (map (map toHyperdataDocument) docs)
+-- TODO query with complex query
+flowCorpusSearchInDatabase :: FlowCmdM env err m
+ => Username
+ -> Lang
+ -> Text
+ -> m CorpusId
+flowCorpusSearchInDatabase u la q = do
+ (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
+ userMaster
+ (Left "")
+ (Nothing :: Maybe HyperdataCorpus)
+ ids <- map fst <$> searchInDatabase cId (stemIt q)
+ flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
+
+
+-- UNUSED
+_flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
+ => Username
+ -> Lang
+ -> Text
+ -> m CorpusId
+_flowCorpusSearchInDatabaseApi u la q = do
+ (_masterUserId, _masterRootId, cId) <- getOrMk_RootWithCorpus
+ userMaster
+ (Left "")
+ (Nothing :: Maybe HyperdataCorpus)
+ ids <- map fst <$> searchInDatabase cId (stemIt q)
+ flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
+
+------------------------------------------------------------------------
+-- | TODO improve the needed type to create/update a corpus
+{- UNUSED
+data UserInfo = Username Text
+ | UserId NodeId
+data CorpusInfo = CorpusName Lang Text
+ | CorpusId Lang NodeId
-}
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE OverloadedStrings #-}
+flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
+ => Maybe c
+ -> Username
+ -> Either CorpusName [CorpusId]
+ -> TermType Lang
+ -> [[a]]
+ -> m CorpusId
+flow c u cn la docs = do
+ ids <- traverse (insertMasterDocs c la ) docs
+ flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
+
+flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
+ => Username
+ -> Either CorpusName [CorpusId]
+ -> TermType Lang
+ -> [[a]]
+ -> m CorpusId
+flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
+
+------------------------------------------------------------------------
+flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
+ => Lang
+ -> Username
+ -> Either CorpusName [CorpusId]
+ -> Maybe c
+ -> [NodeId]
+ -> m CorpusId
+flowCorpusUser l userName corpusName ctype ids = do
+ -- User Flow
+ (userId, _rootId, userCorpusId) <- getOrMk_RootWithCorpus userName corpusName ctype
+ listId <- getOrMkList userCorpusId userId
+ _cooc <- mkNode NodeListCooc listId userId
+ -- TODO: check if present already, ignore
+ _ <- Doc.add userCorpusId ids
+
+ _tId <- mkNode NodeTexts userCorpusId userId
+ -- printDebug "Node Text Id" tId
+
+ -- User List Flow
+ (masterUserId, _masterRootId, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left "") ctype
+ ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
+ _userListId <- flowList_DbRepo listId ngs
+ _mastListId <- getOrMkList masterCorpusId masterUserId
+ -- _ <- insertOccsUpdates userCorpusId mastListId
+ -- printDebug "userListId" userListId
+ -- User Graph Flow
+ _ <- mkDashboard userCorpusId userId
+ _ <- mkGraph userCorpusId userId
+ --_ <- mkPhylo userCorpusId userId
+
+ -- Annuaire Flow
+ -- _ <- mkAnnuaire rootUserId userId
+ pure userCorpusId
+
+
+insertMasterDocs :: ( FlowCmdM env err m
+ , FlowCorpus a
+ , MkCorpus c
+ )
+ => Maybe c
+ -> TermType Lang
+ -> [a]
+ -> m [DocId]
+insertMasterDocs c lang hs = do
+ (masterUserId, _, masterCorpusId) <- getOrMk_RootWithCorpus userMaster (Left corpusMasterName) c
+
+ -- TODO Type NodeDocumentUnicised
+ let docs = map addUniqId hs
+ ids <- insertDb masterUserId masterCorpusId docs
+ let
+ ids' = map reId ids
+ documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' docs)
+ -- TODO
+ -- create a corpus with database name (CSV or PubMed)
+ -- add documents to the corpus (create node_node link)
+ -- this will enable global database monitoring
+
+ -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
+ maps <- mapNodeIdNgrams
+ <$> documentIdWithNgrams (extractNgramsT $ withLang lang documentsWithId) documentsWithId
+
+ terms2id <- insertNgrams $ Map.keys maps
+ -- to be removed
+ let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
+
+ -- new
+ lId <- getOrMkList masterCorpusId masterUserId
+ mapCgramsId <- listInsertDb lId toNodeNgramsW'
+ $ map (first _ngramsTerms . second Map.keys)
+ $ Map.toList maps
+ -- insertDocNgrams
+ _return <- insertNodeNodeNgrams2
+ $ catMaybes [ NodeNodeNgrams2 <$> Just nId
+ <*> getCgramsId mapCgramsId ngrams_type (_ngramsTerms terms)
+ <*> Just (fromIntegral w :: Double)
+ | (terms, mapNgramsTypes) <- Map.toList maps
+ , (ngrams_type, mapNodeIdWeight) <- Map.toList mapNgramsTypes
+ , (nId, w) <- Map.toList mapNodeIdWeight
+ ]
+
+ _ <- Doc.add masterCorpusId ids'
+ _cooc <- mkNode NodeListCooc lId masterUserId
+ -- to be removed
+ _ <- insertDocNgrams lId indexedNgrams
+
+ pure ids'
+
+
+withLang :: HasText a => TermType Lang
+ -> [DocumentWithId a]
+ -> TermType Lang
+withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
+ where
+ m' = case m of
+ Nothing -> trace ("buildTries here" :: String)
+ $ Just
+ $ buildTries n ( fmap toToken $ uniText
+ $ Text.intercalate " . "
+ $ List.concat
+ $ map hasText ns
+ )
+ just_m -> just_m
+withLang l _ = l
+
+
+
+type CorpusName = Text
+
+
+getOrMkRoot :: (HasNodeError err)
+ => Username
+ -> Cmd err (UserId, RootId)
+getOrMkRoot username = do
+ maybeUserId <- getUser username
+ userId <- case maybeUserId of
+ Nothing -> nodeError NoUserFound
+ Just user -> pure $ userLight_id user
+
+ rootId' <- map _node_id <$> getRoot username
-module Gargantext.Database.Flow
- where
-import System.FilePath (FilePath)
-import GHC.Base ((>>))
-import Data.Maybe (Maybe(..))
-import Gargantext.Core.Types (NodePoly(..))
-import Gargantext.Prelude
-import Gargantext.Database.Bashql (runCmd', del)
-import Gargantext.Database.Node (Cmd(..), getRoot, mkRoot, mkCorpus, defaultCorpus)
-import Gargantext.Database.User (getUser, UserLight(..), Username)
-import Gargantext.Database.Node.Document.Import (insertDocuments)
-import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
-
-type UserId = Int
-type RootId = Int
-type CorpusId = Int
-
-subFlow :: Username -> IO (UserId, RootId, CorpusId)
-subFlow username = do
- maybeUserId <- runCmd' (getUser username)
-
- let userId = case maybeUserId of
- Nothing -> panic "Error: User does not exist (yet)" -- mk NodeUser gargantua_id "Node Gargantua"
- Just user -> userLight_id user
-
- rootId' <- map _node_id <$> runCmd' (getRoot userId)
-
rootId'' <- case rootId' of
- [] -> runCmd' (mkRoot userId)
- un -> case length un >= 2 of
- True -> panic "Error: more than 1 userNode / user"
- False -> pure rootId'
- let rootId = maybe (panic "error rootId") identity (head rootId'')
-
- corpusId' <- runCmd' $ mkCorpus (Just "Corpus WOS") Nothing rootId userId
- let corpusId = maybe (panic "error corpusId") identity (head corpusId')
-
- printDebug "(username, userId, rootId, corpusId"
- (username, userId, rootId, corpusId)
+ [] -> mkRoot username userId
+ n -> case length n >= 2 of
+ True -> nodeError ManyNodeUsers
+ False -> pure rootId'
+
+ rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
+ pure (userId, rootId)
+
+
+getOrMk_RootWithCorpus :: (HasNodeError err, MkCorpus a)
+ => Username
+ -> Either CorpusName [CorpusId]
+ -> Maybe a
+ -> Cmd err (UserId, RootId, CorpusId)
+getOrMk_RootWithCorpus username cName c = do
+ (userId, rootId) <- getOrMkRoot username
+ corpusId'' <- if username == userMaster
+ then do
+ ns <- getCorporaWithParentId rootId
+ pure $ map _node_id ns
+ else
+ pure $ fromRight [] cName
+
+ corpusId' <- if corpusId'' /= []
+ then pure corpusId''
+ else do
+ c' <- mk (Just $ fromLeft "Default" cName) c rootId userId
+ _tId <- case head c' of
+ Nothing -> pure [0]
+ Just c'' -> mkNode NodeTexts c'' userId
+ pure c'
+
+ corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
pure (userId, rootId, corpusId)
--- flow :: FilePath -> IO ()
-flow fp = do
+------------------------------------------------------------------------
+viewUniqId' :: UniqId a
+ => a
+ -> (HashId, a)
+viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
+ where
+ err = panic "[ERROR] Database.Flow.toInsert"
- (masterUserId, _, corpusId) <- subFlow "gargantua"
+toInserted :: [ReturnId]
+ -> Map HashId ReturnId
+toInserted =
+ Map.fromList . map (\r -> (reUniqId r, r) )
+ . filter (\r -> reInserted r == True)
- docs <- parseDocs WOS fp
- ids <- runCmd' $ insertDocuments masterUserId corpusId docs
- printDebug "Docs IDs : " ids
+mergeData :: Map HashId ReturnId
+ -> Map HashId a
+ -> [DocumentWithId a]
+mergeData rs = catMaybes . map toDocumentWithId . Map.toList
+ where
+ toDocumentWithId (sha,hpd) =
+ DocumentWithId <$> fmap reId (lookup sha rs)
+ <*> Just hpd
- idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId docs
- printDebug "Docs IDs : " idsRepeat
-
- (userId, rootId, corpusId2) <- subFlow "alexandre"
+------------------------------------------------------------------------
- runCmd' (del [corpusId])
+instance HasText HyperdataContact
+ where
+ hasText = undefined
+instance ExtractNgramsT HyperdataContact
+ where
+ extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
+ where
+ extract :: TermType Lang -> HyperdataContact
+ -> Cmd err (Map Ngrams (Map NgramsType Int))
+ extract _l hc' = do
+ let authors = map text2ngrams
+ $ maybe ["Nothing"] (\a -> [a])
+ $ view (hc_who . _Just . cw_lastName) hc'
+ pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
+instance HasText HyperdataDocument
+ where
+ hasText h = catMaybes [ _hyperdataDocument_title h
+ , _hyperdataDocument_abstract h
+ ]
-{-
- --folderId <- mk Folder parentId (Name "Data") (Descr "All corpora DATA here")
- folderId <- mk Folder rootId "Data"
- corpusId <- mk Corpus folderId (Name "WOS") (Descr "WOS database description")
--}
+instance ExtractNgramsT HyperdataDocument
+ where
+ extractNgramsT :: TermType Lang
+ -> HyperdataDocument
+ -> Cmd err (Map Ngrams (Map NgramsType Int))
+ extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
+ where
+ extractNgramsT' :: TermType Lang
+ -> HyperdataDocument
+ -> Cmd err (Map Ngrams (Map NgramsType Int))
+ extractNgramsT' lang' doc = do
+ let source = text2ngrams
+ $ maybe "Nothing" identity
+ $ _hyperdataDocument_source doc
+ institutes = map text2ngrams
+ $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
+ $ _hyperdataDocument_institutes doc
+ authors = map text2ngrams
+ $ maybe ["Nothing"] (splitOn ", ")
+ $ _hyperdataDocument_authors doc
-{-
- docs <- parseDocuments WOS "doc/.."
- ids <- add (Documents corpusId) docs
+ terms' <- map text2ngrams
+ <$> map (intercalate " " . _terms_label)
+ <$> concat
+ <$> liftIO (extractTerms lang' $ hasText doc)
+
+ pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
+ <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
+ <> [(a', Map.singleton Authors 1) | a' <- authors ]
+ <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
+
+filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
+ -> Map Ngrams (Map NgramsType Int)
+filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
+ where
+ filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
+ True -> (ng,y)
+ False -> (Ngrams (Text.take s' t) n , y)
- user_id <- runCmd' (get RootUser "alexandre")
- rootUser_id <- runCmd' (getRootUser $ userLight_id user_id
- corpusId <- mk Corpus
--}
+documentIdWithNgrams :: HasNodeError err
+ => (a
+ -> Cmd err (Map Ngrams (Map NgramsType Int)))
+ -> [DocumentWithId a]
+ -> Cmd err [DocumentIdWithNgrams a]
+documentIdWithNgrams f = traverse toDocumentIdWithNgrams
+ where
+ toDocumentIdWithNgrams d = do
+ e <- f $ documentData d
+ pure $ DocumentIdWithNgrams d e