2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
26 import Data.Text (Text, splitOn, intercalate)
27 import Data.Tuple.Extra (both)
28 import Data.List (concat)
29 import GHC.Show (Show)
30 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
31 import Gargantext.Core.Types.Individu (Username)
32 import Gargantext.Core.Types.Main
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
35 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
36 import Gargantext.Database.Metrics.TFICF (getTficf)
37 import Gargantext.Text.Terms (extractTerms)
38 import Gargantext.Text.Metrics.TFICF (Tficf(..))
39 import Gargantext.Database.Node.Document.Add (add)
40 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
41 import Gargantext.Database.Root (getRoot)
42 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
43 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
44 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
45 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
46 import Gargantext.Database.Schema.User (getUser, UserLight(..))
47 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
48 import Gargantext.Database.Utils (Cmd)
49 import Gargantext.Text.Terms (TermType(..))
50 import Gargantext.Ext.IMT (toSchoolName)
51 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
52 import Gargantext.Prelude
53 import Gargantext.Text.Parsers (parseDocs, FileFormat)
54 import System.FilePath (FilePath)
55 import qualified Data.Map as DM
58 flowCorpus :: HasNodeError err => FileFormat -> FilePath -> CorpusName -> Cmd err CorpusId
59 flowCorpus ff fp cName = do
60 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
61 params <- flowInsert NodeCorpus hyperdataDocuments' cName
62 flowCorpus' NodeCorpus hyperdataDocuments' params
65 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
66 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
67 flowInsert _nt hyperdataDocuments cName = do
68 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
70 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
71 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
73 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
74 _ <- add userCorpusId (map reId ids)
76 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
79 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
80 flowAnnuaire filePath = do
81 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
82 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
83 printDebug "length annuaire" ps
86 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
87 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
88 flowInsertAnnuaire name children = do
90 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
91 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
93 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
94 _ <- add userCorpusId (map reId ids)
96 --printDebug "AnnuaireID" userCorpusId
98 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
101 -- check userId CanFillUserCorpus userCorpusId
102 -- check masterUserId CanFillMasterCorpus masterCorpusId
106 -- InsertedNodeNgrams ?
107 flowCorpus' :: HasNodeError err
108 => NodeType -> [HyperdataDocument]
109 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
111 flowCorpus' NodeCorpus hyperdataDocuments (ids,_masterUserId,_masterCorpusId, userId,userCorpusId) = do
112 --------------------------------------------------
114 _userListId <- flowListUser userId userCorpusId 500
115 --printDebug "Working on User ListId : " userListId
117 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
118 -- printDebug "documentsWithId" documentsWithId
119 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
120 -- printDebug "docsWithNgrams" docsWithNgrams
121 let maps = mapNodeIdNgrams docsWithNgrams
123 -- printDebug "maps" (maps)
124 terms2id <- insertNgrams $ DM.keys maps
125 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
126 -- printDebug "inserted ngrams" indexedNgrams
127 _ <- insertToNodeNgrams indexedNgrams
129 --listId2 <- flowList masterUserId masterCorpusId indexedNgrams
130 --printDebug "Working on ListId : " listId2
132 --------------------------------------------------
133 _ <- mkDashboard userCorpusId userId
134 _ <- mkGraph userCorpusId userId
137 -- _ <- mkAnnuaire rootUserId userId
140 -- del [corpusId2, corpusId]
142 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
143 flowCorpus' _ _ _ = undefined
146 type CorpusName = Text
148 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
149 subFlowCorpus username cName = do
150 maybeUserId <- getUser username
152 userId <- case maybeUserId of
153 Nothing -> nodeError NoUserFound
154 -- mk NodeUser gargantua_id "Node Gargantua"
155 Just user -> pure $ userLight_id user
157 rootId' <- map _node_id <$> getRoot username
159 rootId'' <- case rootId' of
160 [] -> mkRoot username userId
161 n -> case length n >= 2 of
162 True -> nodeError ManyNodeUsers
163 False -> pure rootId'
164 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
166 corpusId'' <- if username == userMaster
168 ns <- getCorporaWithParentId rootId
169 pure $ map _node_id ns
173 corpusId' <- if corpusId'' /= []
175 else mkCorpus (Just cName) Nothing rootId userId
177 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
179 --printDebug "(username, userId, rootId, corpusId)"
180 -- (username, userId, rootId, corpusId)
181 pure (userId, rootId, corpusId)
184 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
185 subFlowAnnuaire username _cName = do
186 maybeUserId <- getUser username
188 userId <- case maybeUserId of
189 Nothing -> nodeError NoUserFound
190 -- mk NodeUser gargantua_id "Node Gargantua"
191 Just user -> pure $ userLight_id user
193 rootId' <- map _node_id <$> getRoot username
195 rootId'' <- case rootId' of
196 [] -> mkRoot username userId
197 n -> case length n >= 2 of
198 True -> nodeError ManyNodeUsers
199 False -> pure rootId'
200 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
202 corpusId' <- mkAnnuaire rootId userId
204 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
206 --printDebug "(username, userId, rootId, corpusId)"
207 -- (username, userId, rootId, corpusId)
208 pure (userId, rootId, corpusId)
210 ------------------------------------------------------------------------
211 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
212 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
214 err = "Database.Flow.toInsert"
216 toInserted :: [ReturnId] -> Map HashId ReturnId
217 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
218 . filter (\r -> reInserted r == True)
220 data DocumentWithId =
221 DocumentWithId { documentId :: !NodeId
222 , documentData :: !HyperdataDocument
225 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
226 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
228 toDocumentWithId (hash,hpd) =
229 DocumentWithId <$> fmap reId (lookup hash rs)
232 ------------------------------------------------------------------------
233 data DocumentIdWithNgrams =
235 { documentWithId :: !DocumentWithId
236 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
240 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
241 extractNgramsT doc = do
242 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
243 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
244 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
245 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
246 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
248 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
249 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
250 <> [(a', DM.singleton Authors 1) | a' <- authors ]
251 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
255 documentIdWithNgrams :: HasNodeError err
256 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
257 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
258 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
260 toDocumentIdWithNgrams d = do
261 e <- f $ documentData d
262 pure $ DocumentIdWithNgrams d e
264 -- | TODO check optimization
265 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
266 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
268 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
269 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
271 nId = documentId $ documentWithId d
273 ------------------------------------------------------------------------
274 flowList :: HasNodeError err => UserId -> CorpusId
275 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
276 flowList uId cId _ngs = do
277 -- printDebug "ngs:" ngs
278 lId <- getOrMkList cId uId
279 --printDebug "ngs" (DM.keys ngs)
280 -- TODO add stemming equivalence of 2 ngrams
282 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
283 -- _ <- insertGroups lId groupEd
285 -- compute Candidate / Map
286 --is <- insertLists lId $ ngrams2list ngs
287 --printDebug "listNgrams inserted :" is
291 flowListUser :: HasNodeError err => UserId -> CorpusId -> Int -> Cmd err NodeId
292 flowListUser uId cId n = do
293 lId <- getOrMkList cId uId
294 -- is <- insertLists lId $ ngrams2list ngs
296 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
297 _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
301 ------------------------------------------------------------------------
306 * DM.keys called twice
307 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
308 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
309 -> Map NgramsIndexed NgramsIndexed
310 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
314 -- TODO check: do not insert duplicates
315 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
316 insertGroups lId ngrs =
317 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
318 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
322 ------------------------------------------------------------------------
323 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
324 -> [(ListType, (NgramsType,NgramsIndexed))]
326 [ (CandidateList, (t, ng))
327 | (ng, tm) <- DM.toList m
331 -- | TODO: weight of the list could be a probability
332 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
333 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
334 | (l,(ngt, ng)) <- lngs
336 ------------------------------------------------------------------------