2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15 {-# LANGUAGE RankNTypes #-}
17 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
20 --import Control.Lens (view)
21 import Control.Monad.IO.Class (liftIO)
22 --import Gargantext.Core.Types
23 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
24 import Data.Map (Map, lookup)
25 import Data.Maybe (Maybe(..), catMaybes)
26 import Data.Text (Text, splitOn, intercalate)
27 import Data.Tuple.Extra (both)
28 import Data.List (concat)
29 import GHC.Show (Show)
30 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId, Terms(..))
31 import Gargantext.Core.Types.Individu (Username)
32 import Gargantext.Core.Types.Main
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
35 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
36 import Gargantext.Database.Metrics.TFICF (getTficf)
37 import Gargantext.Text.Terms (extractTerms)
38 import Gargantext.Text.Metrics.TFICF (Tficf(..))
39 import Gargantext.Database.Node.Document.Add (add)
40 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
41 import Gargantext.Database.Root (getRoot)
42 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsIndexed(..), indexNgrams, NgramsType(..), text2ngrams, ngramsTypeId)
43 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, getOrMkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId, HasNodeError, NodeError(..), nodeError)
44 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
45 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
46 import Gargantext.Database.Schema.User (getUser, UserLight(..))
47 import Gargantext.Database.Types.Node (HyperdataDocument(..), NodeType(..), NodeId, UserId, ListId, CorpusId, RootId, MasterCorpusId, MasterUserId)
48 import Gargantext.Database.Utils (Cmd)
49 import Gargantext.Text.Terms (TermType(..))
50 import Gargantext.Ext.IMT (toSchoolName)
51 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
52 import Gargantext.Prelude
53 import Gargantext.Text.Parsers (parseDocs, FileFormat)
54 import System.FilePath (FilePath)
55 import qualified Data.Map as DM
58 flowCorpus :: HasNodeError err => FileFormat -> FilePath -> CorpusName -> Cmd err CorpusId
59 flowCorpus ff fp cName = do
60 hyperdataDocuments' <- map addUniqIdsDoc <$> liftIO (parseDocs ff fp)
61 params <- flowInsert NodeCorpus hyperdataDocuments' cName
62 flowCorpus' NodeCorpus hyperdataDocuments' params
65 flowInsert :: HasNodeError err => NodeType -> [HyperdataDocument] -> CorpusName
66 -> Cmd err ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
67 flowInsert _nt hyperdataDocuments cName = do
68 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
70 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
71 ids <- insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
73 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
74 _ <- add userCorpusId (map reId ids)
76 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
79 flowAnnuaire :: HasNodeError err => FilePath -> Cmd err ()
80 flowAnnuaire filePath = do
81 contacts <- liftIO $ deserialiseImtUsersFromFile filePath
82 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
83 printDebug "length annuaire" ps
86 flowInsertAnnuaire :: HasNodeError err => CorpusName -> [ToDbData]
87 -> Cmd err ([ReturnId], UserId, CorpusId, UserId, CorpusId)
88 flowInsertAnnuaire name children = do
90 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
91 ids <- insertDocuments masterUserId masterCorpusId NodeContact children
93 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
94 _ <- add userCorpusId (map reId ids)
96 printDebug "AnnuaireID" userCorpusId
98 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
101 flowCorpus' :: HasNodeError err
102 => NodeType -> [HyperdataDocument]
103 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
105 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
106 --------------------------------------------------
108 userListId <- flowListUser userId userCorpusId 3000
109 printDebug "Working on User ListId : " userListId
111 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
112 -- printDebug "documentsWithId" documentsWithId
113 docsWithNgrams <- documentIdWithNgrams extractNgramsT documentsWithId
114 -- printDebug "docsWithNgrams" docsWithNgrams
115 let maps = mapNodeIdNgrams docsWithNgrams
117 -- printDebug "maps" (maps)
118 terms2id <- insertNgrams $ DM.keys maps
119 let indexedNgrams = DM.mapKeys (indexNgrams terms2id) maps
120 -- printDebug "inserted ngrams" indexedNgrams
121 _ <- insertToNodeNgrams indexedNgrams
123 listId2 <- flowList masterUserId masterCorpusId indexedNgrams
124 printDebug "Working on ListId : " listId2
126 --------------------------------------------------
127 _ <- mkDashboard userCorpusId userId
128 _ <- mkGraph userCorpusId userId
131 -- _ <- mkAnnuaire rootUserId userId
134 -- del [corpusId2, corpusId]
136 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
137 flowCorpus' _ _ _ = undefined
140 type CorpusName = Text
142 subFlowCorpus :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
143 subFlowCorpus username cName = do
144 maybeUserId <- getUser username
146 userId <- case maybeUserId of
147 Nothing -> nodeError NoUserFound
148 -- mk NodeUser gargantua_id "Node Gargantua"
149 Just user -> pure $ userLight_id user
151 rootId' <- map _node_id <$> getRoot username
153 rootId'' <- case rootId' of
154 [] -> mkRoot username userId
155 n -> case length n >= 2 of
156 True -> nodeError ManyNodeUsers
157 False -> pure rootId'
158 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
160 corpusId'' <- if username == userMaster
162 ns <- getCorporaWithParentId rootId
163 pure $ map _node_id ns
167 corpusId' <- if corpusId'' /= []
169 else mkCorpus (Just cName) Nothing rootId userId
171 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
173 printDebug "(username, userId, rootId, corpusId)"
174 (username, userId, rootId, corpusId)
175 pure (userId, rootId, corpusId)
178 subFlowAnnuaire :: HasNodeError err => Username -> CorpusName -> Cmd err (UserId, RootId, CorpusId)
179 subFlowAnnuaire username _cName = do
180 maybeUserId <- getUser username
182 userId <- case maybeUserId of
183 Nothing -> nodeError NoUserFound
184 -- mk NodeUser gargantua_id "Node Gargantua"
185 Just user -> pure $ userLight_id user
187 rootId' <- map _node_id <$> getRoot username
189 rootId'' <- case rootId' of
190 [] -> mkRoot username userId
191 n -> case length n >= 2 of
192 True -> nodeError ManyNodeUsers
193 False -> pure rootId'
194 rootId <- maybe (nodeError NoRootFound) pure (head rootId'')
196 corpusId' <- mkAnnuaire rootId userId
198 corpusId <- maybe (nodeError NoCorpusFound) pure (head corpusId')
200 printDebug "(username, userId, rootId, corpusId)"
201 (username, userId, rootId, corpusId)
202 pure (userId, rootId, corpusId)
204 ------------------------------------------------------------------------
205 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
206 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
208 err = "Database.Flow.toInsert"
210 toInserted :: [ReturnId] -> Map HashId ReturnId
211 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
212 . filter (\r -> reInserted r == True)
214 data DocumentWithId =
215 DocumentWithId { documentId :: !NodeId
216 , documentData :: !HyperdataDocument
219 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
220 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
222 toDocumentWithId (hash,hpd) =
223 DocumentWithId <$> fmap reId (lookup hash rs)
226 ------------------------------------------------------------------------
227 data DocumentIdWithNgrams =
229 { documentWithId :: !DocumentWithId
230 , document_ngrams :: !(Map Ngrams (Map NgramsType Int))
234 extractNgramsT :: HasNodeError err => HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int))
235 extractNgramsT doc = do
236 let source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
237 let institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
238 let authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
239 let leText = catMaybes [_hyperdataDocument_title doc, _hyperdataDocument_abstract doc]
240 terms' <- map text2ngrams <$> map (intercalate " " . _terms_label) <$> concat <$> liftIO (extractTerms (Multi EN) leText)
242 pure $ DM.fromList $ [(source, DM.singleton Sources 1)]
243 <> [(i', DM.singleton Institutes 1) | i' <- institutes ]
244 <> [(a', DM.singleton Authors 1) | a' <- authors ]
245 <> [(t', DM.singleton NgramsTerms 1) | t' <- terms' ]
249 documentIdWithNgrams :: HasNodeError err
250 => (HyperdataDocument -> Cmd err (Map Ngrams (Map NgramsType Int)))
251 -> [DocumentWithId] -> Cmd err [DocumentIdWithNgrams]
252 documentIdWithNgrams f = mapM toDocumentIdWithNgrams
254 toDocumentIdWithNgrams d = do
255 e <- f $ documentData d
256 pure $ DocumentIdWithNgrams d e
258 -- | TODO check optimization
259 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map Ngrams (Map NgramsType (Map NodeId Int))
260 mapNodeIdNgrams = DM.unionsWith (DM.unionWith (DM.unionWith (+))) . fmap f
262 f :: DocumentIdWithNgrams -> Map Ngrams (Map NgramsType (Map NodeId Int))
263 f d = fmap (fmap (DM.singleton nId)) $ document_ngrams d
265 nId = documentId $ documentWithId d
267 ------------------------------------------------------------------------
268 flowList :: HasNodeError err => UserId -> CorpusId
269 -> Map NgramsIndexed (Map NgramsType (Map NodeId Int)) -> Cmd err ListId
270 flowList uId cId ngs = do
271 -- printDebug "ngs:" ngs
272 lId <- getOrMkList cId uId
273 --printDebug "ngs" (DM.keys ngs)
274 -- TODO add stemming equivalence of 2 ngrams
276 -- let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
277 -- _ <- insertGroups lId groupEd
279 -- compute Candidate / Map
280 is <- insertLists lId $ ngrams2list ngs
281 printDebug "listNgrams inserted :" is
285 flowListUser :: HasNodeError err => UserId -> CorpusId -> Int -> Cmd err NodeId
286 flowListUser uId cId n = do
287 lId <- getOrMkList cId uId
288 -- is <- insertLists lId $ ngrams2list ngs
290 ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
291 _ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
295 ------------------------------------------------------------------------
300 * DM.keys called twice
301 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
302 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
303 -> Map NgramsIndexed NgramsIndexed
304 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
308 -- TODO check: do not insert duplicates
309 insertGroups :: HasNodeError err => ListId -> Map NgramsIndexed NgramsIndexed -> Cmd err Int
310 insertGroups lId ngrs =
311 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
312 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
316 ------------------------------------------------------------------------
317 ngrams2list :: Map NgramsIndexed (Map NgramsType a)
318 -> [(ListType, (NgramsType,NgramsIndexed))]
320 [ (CandidateList, (t, ng))
321 | (ng, tm) <- DM.toList m
325 -- | TODO: weight of the list could be a probability
326 insertLists :: HasNodeError err => ListId -> [(ListType, (NgramsType, NgramsIndexed))] -> Cmd err Int
327 insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (ngramsTypeId ngt) (fromIntegral $ listTypeId l) 1
328 | (l,(ngt, ng)) <- lngs
330 ------------------------------------------------------------------------