Stability : experimental
Portability : POSIX
+-- TODO-ACCESS:
+-- check userId CanFillUserCorpus userCorpusId
+-- check masterUserId CanFillMasterCorpus masterCorpusId
+
+-- TODO-ACCESS: check uId CanInsertDoc pId && checkDocType nodeType
+-- TODO-EVENTS: InsertedNodes
-}
-{-# LANGUAGE ConstraintKinds #-}
-{-# LANGUAGE DeriveGeneric #-}
-{-# LANGUAGE NoImplicitPrelude #-}
-{-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE RankNTypes #-}
+{-# OPTIONS_GHC -fno-warn-orphans #-}
+
+{-# LANGUAGE ConstraintKinds #-}
+{-# LANGUAGE RankNTypes #-}
+{-# LANGUAGE ConstrainedClassMethods #-}
+{-# LANGUAGE ConstraintKinds #-}
+{-# LANGUAGE DeriveGeneric #-}
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE InstanceSigs #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
+ ( FlowCmdM
+ , flowCorpusFile
+ , flowCorpus
+ , flowCorpusSearchInDatabase
+ , getOrMkRoot
+ , getOrMkRootWithCorpus
+ , flowAnnuaire
+ )
where
-
---import Control.Lens (view)
+import Prelude (String)
+import Data.Either
+import Debug.Trace (trace)
+import Control.Lens ((^.), view, _Just)
import Control.Monad.IO.Class (liftIO)
---import Gargantext.Core.Types
---import Gargantext.Database.Node.Contact (HyperdataContact(..))
+import Data.List (concat)
import Data.Map (Map, lookup)
import Data.Maybe (Maybe(..), catMaybes)
import Data.Monoid
import Data.Text (Text, splitOn, intercalate)
-import Data.Tuple.Extra (both)
-import Data.List (concat)
-import GHC.Show (Show)
-import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
+import Gargantext.Core (Lang(..))
+import Gargantext.Core.Types (NodePoly(..), Terms(..))
import Gargantext.Core.Types.Individu (Username)
+import Gargantext.Core.Flow
import Gargantext.Core.Types.Main
-import Gargantext.Core (Lang(..))
-import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
-import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
-import Gargantext.Database.Metrics.TFICF (getTficf)
-import Gargantext.Text.Terms (extractTerms)
-import Gargantext.Text.Metrics.TFICF (Tficf(..))
-import Gargantext.Database.Node.Document.Add (add)
-import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
+import Gargantext.Database.Config (userMaster, corpusMasterName)
+import Gargantext.Database.Flow.Utils (insertDocNgrams)
+import Gargantext.Database.Flow.List
+import Gargantext.Database.Flow.Types
+import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
+import Gargantext.Database.Node.Document.Insert -- (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Root (getRoot)
-import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
-import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
-import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
-import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
+import Gargantext.Database.Schema.Ngrams -- (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
+import Gargantext.Database.Schema.Node -- (mkRoot, mkCorpus, getOrMkList, mkGraph, {-mkPhylo,-} mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
import Gargantext.Database.Schema.User (getUser, UserLight(..))
-import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
-import Gargantext.Database.Utils (Cmd, CmdM)
-import Gargantext.Text.Terms (TermType(..))
+import Gargantext.Database.TextSearch (searchInDatabase)
+import Gargantext.Database.Types.Node -- (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
+import Gargantext.Database.Utils (Cmd)
import Gargantext.Ext.IMT (toSchoolName)
import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
import Gargantext.Prelude
-import Gargantext.Text.Parsers (parseDocs, FileFormat)
+import Gargantext.Text.Terms.Eleve (buildTries, toToken)
+import Gargantext.Text.List (buildNgramsLists,StopSize(..))
+import Gargantext.Text.Corpus.Parsers (parseFile, FileFormat)
+import qualified Gargantext.Text.Corpus.API.Isidore as Isidore
+import Gargantext.Text.Terms (TermType(..), tt_lang, extractTerms, uniText)
+import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
+import Gargantext.Prelude.Utils hiding (sha)
import System.FilePath (FilePath)
+import qualified Data.List as List
+import qualified Data.Map as Map
+import qualified Data.Text as Text
+import qualified Gargantext.Database.Node.Document.Add as Doc (add)
+import qualified Gargantext.Text.Corpus.Parsers.GrandDebat as GD
-import Gargantext.API.Ngrams (NgramsElement(..), insertNewListOfNgramsElements, RepoCmdM)
-
-import qualified Data.Map as DM
-
-type FlowCmdM env err m =
- ( CmdM env err m
- , RepoCmdM env err m
- , HasNodeError err
- )
-
-flowCorpus :: FlowCmdM env err m => FileFormat -> FilePath -> CorpusName -> m CorpusId
-flowCorpus ff fp cName = do
- hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
- params <- flowInsert NodeCorpus hyperdataDocuments' cName
- flowCorpus' NodeCorpus hyperdataDocuments' params
-
-
-flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
- -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
-flowInsert _nt hyperdataDocuments cName = do
- let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
-
- (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
- ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
-
- (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
- _ <- add userCorpusId (map reId ids)
-
- pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
-
-
-flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
-flowAnnuaire filePath = do
- contacts <- liftIO $ deserialiseImtUsersFromFile filePath
- ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
- printDebug "length annuaire" ps
-
+------------------------------------------------------------------------
-flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
- -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
-flowInsertAnnuaire name children = do
+data ApiQuery = ApiIsidoreQuery Text | ApiIsidoreAuth Text
+-- | APIs
+-- TODO instances
+getDataApi :: Lang
+ -> Maybe Limit
+ -> ApiQuery
+ -> IO [HyperdataDocument]
+getDataApi lang limit (ApiIsidoreQuery q) = Isidore.get lang limit (Just q) Nothing
+getDataApi lang limit (ApiIsidoreAuth q) = Isidore.get lang limit Nothing (Just q)
+
+
+-- UNUSED
+_flowCorpusApi :: ( FlowCmdM env err m)
+ => Username -> Either CorpusName [CorpusId]
+ -> TermType Lang
+ -> Maybe Limit
+ -> ApiQuery
+ -> m CorpusId
+_flowCorpusApi u n tt l q = do
+ docs <- liftIO $ splitEvery 500 <$> getDataApi (_tt_lang tt) l q
+ flowCorpus u n tt docs
- (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
- ids <- insertDocuments masterUserId masterCorpusId NodeContact children
+------------------------------------------------------------------------
- (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
- _ <- add userCorpusId (map reId ids)
+flowAnnuaire :: FlowCmdM env err m
+ => Username
+ -> Either CorpusName [CorpusId]
+ -> (TermType Lang)
+ -> FilePath
+ -> m AnnuaireId
+flowAnnuaire u n l filePath = do
+ docs <- liftIO $ (( splitEvery 500 <$> deserialiseImtUsersFromFile filePath) :: IO [[HyperdataContact]])
+ flow (Nothing :: Maybe HyperdataAnnuaire) u n l docs
+
+-- UNUSED
+_flowCorpusDebat :: FlowCmdM env err m
+ => Username -> Either CorpusName [CorpusId]
+ -> Limit -> FilePath
+ -> m CorpusId
+_flowCorpusDebat u n l fp = do
+ docs <- liftIO ( splitEvery 500
+ <$> take l
+ <$> readFile' fp
+ :: IO [[GD.GrandDebatReference ]]
+ )
+ flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
+
+flowCorpusFile :: FlowCmdM env err m
+ => Username -> Either CorpusName [CorpusId]
+ -> Limit -- Limit the number of docs (for dev purpose)
+ -> TermType Lang -> FileFormat -> FilePath
+ -> m CorpusId
+flowCorpusFile u n l la ff fp = do
+ docs <- liftIO ( splitEvery 500
+ <$> take l
+ <$> parseFile ff fp
+ )
+ flowCorpus u n la (map (map toHyperdataDocument) docs)
+
+-- TODO query with complex query
+flowCorpusSearchInDatabase :: FlowCmdM env err m
+ => Username
+ -> Lang
+ -> Text
+ -> m CorpusId
+flowCorpusSearchInDatabase u la q = do
+ (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
+ userMaster
+ (Left "")
+ (Nothing :: Maybe HyperdataCorpus)
+ ids <- map fst <$> searchInDatabase cId (stemIt q)
+ flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
+
+
+-- UNUSED
+_flowCorpusSearchInDatabaseApi :: FlowCmdM env err m
+ => Username
+ -> Lang
+ -> Text
+ -> m CorpusId
+_flowCorpusSearchInDatabaseApi u la q = do
+ (_masterUserId, _masterRootId, cId) <- getOrMkRootWithCorpus
+ userMaster
+ (Left "")
+ (Nothing :: Maybe HyperdataCorpus)
+ ids <- map fst <$> searchInDatabase cId (stemIt q)
+ flowCorpusUser la u (Left q) (Nothing :: Maybe HyperdataCorpus) ids
- --printDebug "AnnuaireID" userCorpusId
+------------------------------------------------------------------------
+-- | TODO improve the needed type to create/update a corpus
+{- UNUSED
+data UserInfo = Username Text
+ | UserId NodeId
+data CorpusInfo = CorpusName Lang Text
+ | CorpusId Lang NodeId
+-}
- pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
+flow :: (FlowCmdM env err m, FlowCorpus a, MkCorpus c)
+ => Maybe c
+ -> Username
+ -> Either CorpusName [CorpusId]
+ -> TermType Lang
+ -> [[a]]
+ -> m CorpusId
+flow c u cn la docs = do
+ ids <- mapM (insertMasterDocs c la ) docs
+ flowCorpusUser (la ^. tt_lang) u cn c (concat ids)
+
+flowCorpus :: (FlowCmdM env err m, FlowCorpus a)
+ => Username
+ -> Either CorpusName [CorpusId]
+ -> TermType Lang
+ -> [[a]]
+ -> m CorpusId
+flowCorpus = flow (Nothing :: Maybe HyperdataCorpus)
--- TODO-ACCESS:
--- check userId CanFillUserCorpus userCorpusId
--- check masterUserId CanFillMasterCorpus masterCorpusId
---
--- TODO-EVENTS:
--- InsertedNgrams ?
--- InsertedNodeNgrams ?
-flowCorpus' :: FlowCmdM env err m
- => NodeType -> [HyperdataDocument]
- -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
- -> m CorpusId
-flowCorpus' NodeCorpus hyperdataDocuments (ids,_masterUserId,_masterCorpusId, userId,userCorpusId) = do
---------------------------------------------------
- -- List Ngrams Flow
- _userListId <- flowListUser userId userCorpusId 500
- --printDebug "Working on User ListId : " userListId
-
- let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
- -- printDebug "documentsWithId" documentsWithId
- docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
- -- printDebug "docsWithNgrams" docsWithNgrams
- let maps = mapNodeIdNgrams docsWithNgrams
-
- -- printDebug "maps" (maps)
- terms2id <- insertNgrams $ DM.keys maps
- let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
- -- printDebug "inserted ngrams" indexedNgrams
- _ <- insertToNodeNgrams indexedNgrams
-
- --listId2 <- flowList masterUserId masterCorpusId indexedNgrams
- --printDebug "Working on ListId : " listId2
- --}
---------------------------------------------------
+------------------------------------------------------------------------
+flowCorpusUser :: (FlowCmdM env err m, MkCorpus c)
+ => Lang
+ -> Username
+ -> Either CorpusName [CorpusId]
+ -> Maybe c
+ -> [NodeId]
+ -> m CorpusId
+flowCorpusUser l userName corpusName ctype ids = do
+ -- User Flow
+ (userId, _rootId, userCorpusId) <- getOrMkRootWithCorpus userName corpusName ctype
+ listId <- getOrMkList userCorpusId userId
+ _cooc <- mkNode NodeListCooc listId userId
+ -- TODO: check if present already, ignore
+ _ <- Doc.add userCorpusId ids
+
+ _tId <- mkNode NodeTexts userCorpusId userId
+ -- printDebug "Node Text Id" tId
+
+ -- User List Flow
+ --{-
+ (_masterUserId, _masterRootId, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left "") ctype
+ ngs <- buildNgramsLists l 2 3 (StopSize 3) userCorpusId masterCorpusId
+ _userListId <- flowList listId ngs
+ --mastListId <- getOrMkList masterCorpusId masterUserId
+ -- _ <- insertOccsUpdates userCorpusId mastListId
+ -- printDebug "userListId" userListId
+ -- User Graph Flow
_ <- mkDashboard userCorpusId userId
- _ <- mkGraph userCorpusId userId
+ _ <- mkGraph userCorpusId userId
+ --_ <- mkPhylo userCorpusId userId
+ --}
+
-- Annuaire Flow
-- _ <- mkAnnuaire rootUserId userId
-
pure userCorpusId
- -- del [corpusId2, corpusId]
-flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
-flowCorpus' _ _ _ = undefined
+
+insertMasterDocs :: ( FlowCmdM env err m
+ , FlowCorpus a
+ , MkCorpus c
+ )
+ => Maybe c
+ -> TermType Lang
+ -> [a]
+ -> m [DocId]
+insertMasterDocs c lang hs = do
+ (masterUserId, _, masterCorpusId) <- getOrMkRootWithCorpus userMaster (Left corpusMasterName) c
+
+ -- TODO Type NodeDocumentUnicised
+ let hs' = map addUniqId hs
+ ids <- insertDb masterUserId masterCorpusId hs'
+ let documentsWithId = mergeData (toInserted ids) (Map.fromList $ map viewUniqId' hs')
+
+ let
+ fixLang (Unsupervised l n s m) = Unsupervised l n s m'
+ where
+ m' = case m of
+ Nothing -> trace ("buildTries here" :: String)
+ $ Just
+ $ buildTries n ( fmap toToken $ uniText
+ $ Text.intercalate " . "
+ $ List.concat
+ $ map hasText documentsWithId
+ )
+ just_m -> just_m
+ fixLang l = l
+
+ lang' = fixLang lang
+ -- maps :: IO Map Ngrams (Map NgramsType (Map NodeId Int))
+ maps <- mapNodeIdNgrams <$> documentIdWithNgrams (extractNgramsT lang') documentsWithId
+ terms2id <- insertNgrams $ Map.keys maps
+ let indexedNgrams = Map.mapKeys (indexNgrams terms2id) maps
+
+ lId <- getOrMkList masterCorpusId masterUserId
+ _cooc <- mkNode NodeListCooc lId masterUserId
+ _ <- insertDocNgrams lId indexedNgrams
+
+ pure $ map reId ids
type CorpusName = Text
-subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
-subFlowCorpus username cName = do
- maybeUserId <- getUser username
+getOrMkRoot :: (HasNodeError err)
+ => Username
+ -> Cmd err (UserId, RootId)
+getOrMkRoot username = do
+ maybeUserId <- getUser username
userId <- case maybeUserId of
Nothing -> nodeError NoUserFound
- -- mk NodeUser gargantua_id "Node Gargantua"
Just user -> pure $ userLight_id user
rootId' <- map _node_id <$> getRoot username
n -> case length n >= 2 of
True -> nodeError ManyNodeUsers
False -> pure rootId'
+
rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
+ pure (userId, rootId)
+
+getOrMkRootWithCorpus :: (HasNodeError err, MkCorpus a)
+ => Username
+ -> Either CorpusName [CorpusId]
+ -> Maybe a
+ -> Cmd err (UserId, RootId, CorpusId)
+getOrMkRootWithCorpus username cName c = do
+ (userId, rootId) <- getOrMkRoot username
corpusId'' <- if username == userMaster
then do
ns <- getCorporaWithParentId rootId
pure $ map _node_id ns
else
- pure []
+ pure $ fromRight [] cName
corpusId' <- if corpusId'' /= []
then pure corpusId''
- else mkCorpus (Just cName) Nothing rootId userId
+ else mk (Just $ fromLeft "Default" cName) c rootId userId
corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
- --printDebug "(username, userId, rootId, corpusId)"
- -- (username, userId, rootId, corpusId)
pure (userId, rootId, corpusId)
-subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
-subFlowAnnuaire username _cName = do
- maybeUserId <- getUser username
-
- userId <- case maybeUserId of
- Nothing -> nodeError NoUserFound
- -- mk NodeUser gargantua_id "Node Gargantua"
- Just user -> pure $ userLight_id user
-
- rootId' <- map _node_id <$> getRoot username
-
- rootId'' <- case rootId' of
- [] -> mkRoot username userId
- n -> case length n >= 2 of
- True -> nodeError ManyNodeUsers
- False -> pure rootId'
- rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
-
- corpusId' <- mkAnnuaire rootId userId
-
- corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
-
- --printDebug "(username, userId, rootId, corpusId)"
- -- (username, userId, rootId, corpusId)
- pure (userId, rootId, corpusId)
-
------------------------------------------------------------------------
-toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
-toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
- where
- err = "Database.Flow.toInsert"
+viewUniqId' :: UniqId a
+ => a
+ -> (HashId, a)
+viewUniqId' d = maybe err (\h -> (h,d)) (view uniqId d)
+ where
+ err = panic "[ERROR] Database.Flow.toInsert"
-toInserted :: [ReturnId] -> Map HashId ReturnId
-toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
- . filter (\r -> reInserted r == True)
-data DocumentWithId =
- DocumentWithId { documentId :: !NodeId
- , documentData :: !HyperdataDocument
- } deriving (Show)
+toInserted :: [ReturnId]
+ -> Map HashId ReturnId
+toInserted =
+ Map.fromList . map (\r -> (reUniqId r, r) )
+ . filter (\r -> reInserted r == True)
-mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
-mergeData rs = catMaybes . map toDocumentWithId . DM.toList
+mergeData :: Map HashId ReturnId
+ -> Map HashId a
+ -> [DocumentWithId a]
+mergeData rs = catMaybes . map toDocumentWithId . Map.toList
where
- toDocumentWithId (hash,hpd) =
- DocumentWithId <$> fmap reId (lookup hash rs)
+ toDocumentWithId (sha,hpd) =
+ DocumentWithId <$> fmap reId (lookup sha rs)
<*> Just hpd
------------------------------------------------------------------------
-data DocumentIdWithNgrams =
- DocumentIdWithNgrams
- { documentWithId :: !DocumentWithId
- , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
- } deriving (Show)
--- TODO group terms
-extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
-extractNgramsT doc = do
- let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
- let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
- let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
- let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
- terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
+instance HasText HyperdataContact
+ where
+ hasText = undefined
- pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
- <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
- <> [(a', DM.singleton Authors 1) | a' <- authors ]
- <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
+instance ExtractNgramsT HyperdataContact
+ where
+ extractNgramsT l hc = filterNgramsT 255 <$> extract l hc
+ where
+ extract :: TermType Lang -> HyperdataContact
+ -> Cmd err (Map Ngrams (Map NgramsType Int))
+ extract _l hc' = do
+ let authors = map text2ngrams
+ $ maybe ["Nothing"] (\a -> [a])
+ $ view (hc_who . _Just . cw_lastName) hc'
+
+ pure $ Map.fromList $ [(a', Map.singleton Authors 1) | a' <- authors ]
+
+instance HasText HyperdataDocument
+ where
+ hasText h = catMaybes [ _hyperdataDocument_title h
+ , _hyperdataDocument_abstract h
+ ]
+instance ExtractNgramsT HyperdataDocument
+ where
+ extractNgramsT :: TermType Lang
+ -> HyperdataDocument
+ -> Cmd err (Map Ngrams (Map NgramsType Int))
+ extractNgramsT lang hd = filterNgramsT 255 <$> extractNgramsT' lang hd
+ where
+ extractNgramsT' :: TermType Lang
+ -> HyperdataDocument
+ -> Cmd err (Map Ngrams (Map NgramsType Int))
+ extractNgramsT' lang' doc = do
+ let source = text2ngrams
+ $ maybe "Nothing" identity
+ $ _hyperdataDocument_source doc
+
+ institutes = map text2ngrams
+ $ maybe ["Nothing"] (map toSchoolName . (splitOn ", "))
+ $ _hyperdataDocument_institutes doc
+
+ authors = map text2ngrams
+ $ maybe ["Nothing"] (splitOn ", ")
+ $ _hyperdataDocument_authors doc
+
+ terms' <- map text2ngrams
+ <$> map (intercalate " " . _terms_label)
+ <$> concat
+ <$> liftIO (extractTerms lang' $ hasText doc)
+
+ pure $ Map.fromList $ [(source, Map.singleton Sources 1)]
+ <> [(i', Map.singleton Institutes 1) | i' <- institutes ]
+ <> [(a', Map.singleton Authors 1) | a' <- authors ]
+ <> [(t', Map.singleton NgramsTerms 1) | t' <- terms' ]
+
+filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
+ -> Map Ngrams (Map NgramsType Int)
+filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
+ where
+ filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
+ True -> (ng,y)
+ False -> (Ngrams (Text.take s' t) n , y)
documentIdWithNgrams :: HasNodeError err
- => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
- -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
+ => (a
+ -> Cmd err (Map Ngrams (Map NgramsType Int)))
+ -> [DocumentWithId a]
+ -> Cmd err [DocumentIdWithNgrams a]
documentIdWithNgrams f = mapM toDocumentIdWithNgrams
where
toDocumentIdWithNgrams d = do
- e <- f $ documentData d
- pure $ DocumentIdWithNgrams d e
-
--- | TODO check optimization
-mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
-mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
- where
- f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
- f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
- where
- nId = documentId $ documentWithId d
-
-------------------------------------------------------------------------
-flowList :: HasNodeError err => UserId -> CorpusId
- -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
-flowList uId cId _ngs = do
- -- printDebug "ngs:" ngs
- lId <- getOrMkList cId uId
- --printDebug "ngs" (DM.keys ngs)
- -- TODO add stemming equivalence of 2 ngrams
- -- TODO needs rework
- -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
- -- _ <- insertGroups lId groupEd
-
--- compute Candidate / Map
- --is <- insertLists lId $ ngrams2list ngs
- --printDebug "listNgrams inserted :" is
-
- pure lId
-
-flowListUser :: FlowCmdM env err m
- => UserId -> CorpusId -> Int -> m NodeId
-flowListUser uId cId n = do
- lId <- getOrMkList cId uId
- -- is <- insertLists lId $ ngrams2list ngs
-
- ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
--- _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
-
- insertNewListOfNgramsElements lId NgramsTerms $
- [ NgramsElement (tficf_ngramsTerms ng) GraphList 1 Nothing mempty
- | ng <- ngs ]
-
- pure lId
-
-------------------------------------------------------------------------
-
-{-
- TODO rework:
- * quadratic
- * DM.keys called twice
-groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
- -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
- -> Map NgramsIndexed NgramsIndexed
-groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
--}
-
-
--- TODO check: do not insert duplicates
-insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
-insertGroups lId ngrs =
- insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
- | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
- , ng1 /= ng2
- ]
-
-------------------------------------------------------------------------
-ngrams2list :: Map NgramsIndexed (Map NgramsType a)
- -> [(ListType, (NgramsType,NgramsIndexed))]
-ngrams2list m =
- [ (CandidateList, (t, ng))
- | (ng, tm) <- DM.toList m
- , t <- DM.keys tm
- ]
-
--- | TODO: weight of the list could be a probability
-insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
-insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
- | (l,(ngt, ng)) <- lngs
- ]
-------------------------------------------------------------------------
+ e <- f $ documentData d
+ pure $ DocumentIdWithNgrams d e