Portability : POSIX
-Map (NgramsId, NodeId) -> insert
-data NgramsType = Sources | Authors | Terms
-nodes_ngrams : column type, column list
+-- TODO-ACCESS:
+-- check userId CanFillUserCorpus userCorpusId
+-- check masterUserId CanFillMasterCorpus masterCorpusId
+
+-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
+-- TODO-EVENTS: InsertedNodes
-documents
-sources
-authors
-}
+{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE RankNTypes #-}
+{-# LANGUAGE FlexibleContexts #-}
-module Gargantext.Database.Flow
+module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
where
-import System.FilePath (FilePath)
-import Data.Maybe (Maybe(..))
-import Data.Text (Text, unpack)
-import Data.Map (Map)
-import qualified Data.Map as DM
-import GHC.Generics (Generic)
-import Gargantext.Core.Types (NodePoly(..))
+--import Debug.Trace (trace)
+import Control.Lens ((^.), view, Lens', _Just)
+import Control.Monad (mapM_)
+import Control.Monad.IO.Class (liftIO)
+import Data.List (concat)
+import Data.Map (Map, lookup, toList)
+import Data.Maybe (Maybe(..), catMaybes)
+import Data.Monoid
+import Data.Text (Text, splitOn, intercalate)
+import GHC.Show (Show)
+import Gargantext.API.Ngrams (HasRepoVar)
+import Gargantext.API.Ngrams (NgramsElement(..), putListNgrams, RepoCmdM)
+import Gargantext.Core (Lang(..))
+import Gargantext.Core.Types (NodePoly(..), Terms(..))
+import Gargantext.Core.Types.Individu (Username)
+import Gargantext.Core.Types.Main
+import Gargantext.Database.Config (userMaster, corpusMasterName)
+import Gargantext.Database.Flow.Utils (insertDocNgrams)
+import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
+import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
+import Gargantext.Database.Root (getRoot)
+import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
+import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
+import Gargantext.Database.Schema.User (getUser, UserLight(..))
+import Gargantext.Database.TextSearch (searchInDatabase)
+import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
+import Gargantext.Database.Utils (Cmd, CmdM)
+import Gargantext.Ext.IMT (toSchoolName)
+import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude
-import Gargantext.Database.Bashql (runCmd', del)
-import Gargantext.Database.Types.Node (Node(..), HyperdataDocument(..))
-import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..))
-import Gargantext.Database.User (getUser, UserLight(..), Username)
-import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
-import Gargantext.Database.Node.Document.Add (add)
-import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
-import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
-import Gargantext.Database.Ngram (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId)
-
-type UserId = Int
-type RootId = Int
-type CorpusId = Int
-
-subFlow :: Username -> IO (UserId, RootId, CorpusId)
-subFlow username = do
- maybeUserId <- runCmd' (getUser username)
-
- let userId = case maybeUserId of
- Nothing -> panic "Error: User does not exist (yet)"
- -- mk NodeUser gargantua_id "Node Gargantua"
- Just user -> userLight_id user
-
- rootId' <- map _node_id <$> runCmd' (getRoot userId)
+import Gargantext.Text.List (buildNgramsLists,StopSize(..))
+import Gargantext.Text.Parsers (parseFile, FileFormat)
+import Gargantext.Text.Terms (TermType(..), tt_lang)
+import Gargantext.Text.Terms (extractTerms)
+import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
+import Servant (ServantErr)
+import System.FilePath (FilePath)
+--import qualified Data.List as List
+import qualified Data.Map as Map
+import qualified Data.Text as Text
+import qualified Gargantext.Database.Node.Document.Add as Doc (add)
+import qualified Gargantext.Text.Parsers.GrandDebat as GD
+
+type FlowCmdM env err m =
+ ( CmdM env err m
+ , RepoCmdM env err m
+ , HasNodeError err
+ , HasRepoVar env
+ )
+
+type FlowCorpus a = ( AddUniqId a
+ , UniqId a
+ , InsertDb a
+ , ExtractNgramsT a
+ )
+
+------------------------------------------------------------------------
+
+flowAnnuaire :: FlowCmdM env ServantErr m
+ => Username -> CorpusName -> (TermType Lang) -> FilePath -> m AnnuaireId
+flowAnnuaire u n l filePath = do
+ docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
+ flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
+
+
+flowCorpusDebat :: FlowCmdM env ServantErr m
+ => Username -> CorpusName
+ -> Limit -> FilePath
+ -> m CorpusId
+flowCorpusDebat u n l fp = do
+ docs <- liftIO ( splitEvery 500
+ <$> take l
+ <$> GD.readFile fp
+ :: IO [[GD.GrandDebatReference ]]
+ )
+ flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
+
+
+flowCorpusFile :: FlowCmdM env ServantErr m
+ => Username -> CorpusName
+ -> Limit -- Limit the number of docs (for dev purpose)
+ -> TermType Lang -> FileFormat -> FilePath
+ -> m CorpusId
+flowCorpusFile u n l la ff fp = do
+ docs <- liftIO ( splitEvery 500
+ <$> take l
+ <$> parseFile ff fp
+ )
+ flowCorpus u n la (map (map toHyperdataDocument) docs)
+
+-- TODO query with complex query
+flowCorpusSearchInDatabase :: FlowCmdM env err m
+ => Username -> Lang -> Text -> m CorpusId
+flowCorpusSearchInDatabase u la q = do
+ (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
+ ids <- map fst <$> searchInDatabase cId (stemIt q)
+ flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
+
+
+flowCorpusSearchInDatabase' :: FlowCmdM env ServantErr m
+ => Username -> Lang -> Text -> m CorpusId
+flowCorpusSearchInDatabase' u la q = do
+ (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus userMaster "" (Nothing :: Maybe HyperdataCorpus)
+ ids <- map fst <$> searchInDatabase cId (stemIt q)
+ flowCorpusUser la u q (Nothing :: Maybe HyperdataCorpus) ids
+
+
+------------------------------------------------------------------------
+
+flow :: (FlowCmdM env ServantErr m, FlowCorpus a, MkCorpus c)
+ => Maybe c -> Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
+flow c u cn la docs = do
+ ids <- mapM (insertMasterDocs c la ) docs
+ flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
+
+flowCorpus :: (FlowCmdM env ServantErr m, FlowCorpus a)
+ => Username -> CorpusName -> TermType Lang -> [[a]] -> m CorpusId
+flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
+
+
+flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
+ => Lang -> Username -> CorpusName -> Maybe c -> [NodeId] -> m CorpusId
+flowCorpusUser l userName corpusName ctype ids = do
+ -- User Flow
+ (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
+ -- TODO: check if present already, ignore
+ _ <- Doc.add userCorpusId ids
+
+ -- User List Flow
+ --{-
+ (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster "" ctype
+ ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
+ userListId <- flowList userId userCorpusId ngs
+ printDebug "userListId" userListId
+ -- User Graph Flow
+ _ <- mkGraph userCorpusId userId
+ --}
+
+ -- User Dashboard Flow
+ _ <- mkDashboard userCorpusId userId
+
+ -- Annuaire Flow
+ -- _ <- mkAnnuaire rootUserId userId
+ pure userCorpusId
+
+
+insertMasterDocs :: ( FlowCmdM env ServantErr m
+ , FlowCorpus a
+ , MkCorpus c
+ )
+ => Maybe c -> TermType Lang -> [a] -> m [DocId]
+insertMasterDocs c lang hs = do
+ (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster corpusMasterName c
+
+ -- TODO Type NodeDocumentUnicised
+ let hs' = map addUniqId hs
+ ids <- insertDb masterUserId masterCorpusId hs'
+ let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
+
+ maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang) documentsWithId
+ terms2id <- insertNgrams $ Map.keys maps
+ let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
+
+ lId <- getOrMkList masterCorpusId masterUserId
+ _ <- insertDocNgrams lId indexedNgrams
+ pure $ map reId ids
+
+
+
+type CorpusName = Text
+
+getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
+ => Username -> CorpusName -> Maybe a
+ -> Cmd err (UserId, RootId, CorpusId)
+getOrMkRootWithCorpus username cName c = do
+ maybeUserId <- getUser username
+ userId <- case maybeUserId of
+ Nothing -> nodeError NoUserFound
+ Just user -> pure $ userLight_id user
+
+ rootId' <- map _node_id <$> getRoot username
rootId'' <- case rootId' of
- [] -> runCmd' (mkRoot userId)
- un -> case length un >= 2 of
- True -> panic "Error: more than 1 userNode / user"
- False -> pure rootId'
- let rootId = maybe (panic "error rootId") identity (head rootId'')
+ [] -> mkRoot username userId
+ n -> case length n >= 2 of
+ True -> nodeError ManyNodeUsers
+ False -> pure rootId'
+
+ rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
+
+ corpusId'' <- if username == userMaster
+ then do
+ ns <- getCorporaWithParentId rootId
+ pure $ map _node_id ns
+ else
+ pure []
+
+ corpusId' <- if corpusId'' /= []
+ then pure corpusId''
+ else mk (Just cName) c rootId userId
+
+ corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
- corpusId' <- runCmd' $ mkCorpus (Just "Corpus WOS") Nothing rootId userId
- let corpusId = maybe (panic "error corpusId") identity (head corpusId')
-
- printDebug "(username, userId, rootId, corpusId"
- (username, userId, rootId, corpusId)
pure (userId, rootId, corpusId)
-flow :: FilePath -> IO Int
-flow fp = do
+------------------------------------------------------------------------
- (masterUserId, _, corpusId) <- subFlow "gargantua"
- docs <- map addUniqIds <$> parseDocs WOS fp
- ids <- runCmd' $ insertDocuments masterUserId corpusId docs
- printDebug "Docs IDs : " ids
+class UniqId a
+ where
+ uniqId :: Lens' a (Maybe HashId)
- idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId docs
- printDebug "Docs IDs : " idsRepeat
- (_, _, corpusId2) <- subFlow "alexandre"
+instance UniqId HyperdataDocument
+ where
+ uniqId = hyperdataDocument_uniqId
- inserted <- runCmd' $ add corpusId2 (map reId ids)
- printDebug "Inserted : " inserted
+instance UniqId HyperdataContact
+ where
+ uniqId = hc_uniqId
- runCmd' $ del [corpusId2, corpusId]
+viewUniqId' :: UniqId a => a -> (HashId, a)
+viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
+ where
+ err = panic "[ERROR] Database.Flow.toInsert"
-----------------------------------------------------------------
-type HashId = Text
-type NodeId = Int
-type ToInsert = Map HashId HyperdataDocument
-type Inserted = Map HashId ReturnId
-toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
-toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d))
+toInserted :: [ReturnId] -> Map HashId ReturnId
+toInserted = Map.fromList . map (\r -> (reUniqId r, r) )
+ . filter (\r -> reInserted r == True)
+
+data DocumentWithId a = DocumentWithId
+ { documentId :: !NodeId
+ , documentData :: !a
+ } deriving (Show)
+
+mergeData :: Map HashId ReturnId
+ -> Map HashId a
+ -> [DocumentWithId a]
+mergeData rs = catMaybes . map toDocumentWithId . Map.toList
where
- hash = maybe "Error" identity
+ toDocumentWithId (hash,hpd) =
+ DocumentWithId <$> fmap reId (lookup hash rs)
+ <*> Just hpd
-toInserted :: [ReturnId] -> Map HashId ReturnId
-toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
- $ filter (\r -> reInserted r == True) rs
+------------------------------------------------------------------------
+data DocumentIdWithNgrams a = DocumentIdWithNgrams
+ { documentWithId :: !(DocumentWithId a)
+ , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
+ } deriving (Show)
-data DocumentWithId = DocumentWithId { documentId :: NodeId
- , documentData :: HyperdataDocument
- }
+-- TODO extractNgrams according to Type of Data
+
+class ExtractNgramsT h
+ where
+ extractNgramsT :: TermType Lang -> h -> Cmd err (Map Ngrams (Map NgramsType Int))
-mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
-mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs
+instance ExtractNgramsT HyperdataContact
where
- lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs)
+ extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
+ where
+ extract :: TermType Lang -> HyperdataContact
+ -> Cmd err (Map Ngrams (Map NgramsType Int))
+ extract _l hc' = do
+ let authors = map text2ngrams
+ $ maybe ["Nothing"] (\a -> [a])
+ $ view (hc_who . _Just . cw_lastName) hc'
+
+ pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
-data DocumentIdWithNgrams = DocumentIdWithNgrams { documentWithId :: DocumentWithId
- , document_ngrams :: Map (NgramsT Ngrams)Int
- }
-documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
- -> [DocumentWithId] -> [DocumentIdWithNgrams]
-documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
--- | TODO check optimization
-mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
-mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
+instance ExtractNgramsT HyperdataDocument
where
- xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
- n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
-
-indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
- -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
-indexNgrams ng2nId = do
- terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
- pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
+ extractNgramsT = extractNgramsT'
+extractNgramsT' :: TermType Lang -> HyperdataDocument
+ -> Cmd err (Map Ngrams (Map NgramsType Int))
+extractNgramsT' lang hd = filterNgramsT 255 <$> extractNgramsT'' lang hd
+ where
+ extractNgramsT'' :: TermType Lang -> HyperdataDocument
+ -> Cmd err (Map Ngrams (Map NgramsType Int))
+ extractNgramsT'' lang' doc = do
+ let source = text2ngrams
+ $ maybe "Nothing" identity
+ $ _hyperdataDocument_source doc
+
+ institutes = map text2ngrams
+ $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
+ $ _hyperdataDocument_institutes doc
+
+ authors = map text2ngrams
+ $ maybe ["Nothing"] (splitOn ", ")
+ $ _hyperdataDocument_authors doc
+
+ leText = catMaybes [ _hyperdataDocument_title doc
+ , _hyperdataDocument_abstract doc
+ ]
+
+ terms' <- map text2ngrams
+ <$> map (intercalate " " . _terms_label)
+ <$> concat
+ <$> liftIO (extractTerms lang' leText)
+
+ pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
+ <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
+ <> [(a', Map.singleton Authors 1) | a' <- authors ]
+ <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
+
+
+filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
+ -> Map Ngrams (Map NgramsType Int)
+filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
+ where
+ filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
+ True -> (ng,y)
+ False -> (Ngrams (Text.take s' t) n , y)
-insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
-insertToNodeNgrams m = insertNodeNgrams $ [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
- (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
- | (ng, nId2int) <- DM.toList m
- , (nId, n) <- DM.toList nId2int
- ]
+documentIdWithNgrams :: HasNodeError err
+ => (a
+ -> Cmd err (Map Ngrams (Map NgramsType Int)))
+ -> [DocumentWithId a]
+ -> Cmd err [DocumentIdWithNgrams a]
+documentIdWithNgrams f = mapM toDocumentIdWithNgrams
+ where
+ toDocumentIdWithNgrams d = do
+ e <- f $ documentData d
+ pure $ DocumentIdWithNgrams d e
--- mk ListGroup
--- groupBy fun
--- insertInto NodeNgramsNgrams
--- compute Candidate / Map
--- add column typelist
--- insertNodeNodeNgram
--- get data of NgramsTable
--- post :: update NodeNodeNgrams
--- group ngrams
+-- FLOW LIST
+-- | TODO check optimization
+mapNodeIdNgrams :: [DocumentIdWithNgrams a]
+ -> Map Ngrams (Map NgramsType (Map NodeId Int))
+mapNodeIdNgrams = Map.unionsWith (Map.unionWith (Map.unionWith (+))) . fmap f
+ where
+ f :: DocumentIdWithNgrams a
+ -> Map Ngrams (Map NgramsType (Map NodeId Int))
+ f d = fmap (fmap (Map.singleton nId)) $ document_ngrams d
+ where
+ nId = documentId $ documentWithId d
+
+------------------------------------------------------------------------
+listInsert :: FlowCmdM env err m
+ => ListId -> Map NgramsType [NgramsElement]
+ -> m ()
+listInsert lId ngs = mapM_ (\(typeList, ngElmts)
+ -> putListNgrams lId typeList ngElmts
+ ) $ toList ngs
+
+flowList :: FlowCmdM env err m => UserId -> CorpusId
+ -> Map NgramsType [NgramsElement]
+ -> m ListId
+flowList uId cId ngs = do
+ lId <- getOrMkList cId uId
+ printDebug "listId flowList" lId
+ listInsert lId ngs
+ --trace (show $ List.filter (\n -> _ne_ngrams n == "versatile") $ List.concat $ Map.elems ngs) $ listInsert lId ngs
+ pure lId