2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
19 import GHC.Show (Show)
20 --import Control.Lens (view)
21 import System.FilePath (FilePath)
22 import Data.Maybe (Maybe(..), catMaybes)
23 import Data.Text (Text, splitOn)
24 import Data.Map (Map, lookup)
25 import Data.Tuple.Extra (both, second)
26 import qualified Data.Map as DM
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Schema.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Schema.Node (mkRoot, mkCorpus, mkList, mkGraph, mkDashboard, mkAnnuaire, getCorporaWithParentId')
32 import Gargantext.Database.Root (getRootCmd)
33 import Gargantext.Database.Types.Node (NodeType(..), NodeId)
34 import Gargantext.Database.Node.Document.Add (add)
35 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
36 import Gargantext.Database.Schema.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
37 import Gargantext.Database.Schema.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
38 import Gargantext.Database.Types.Node (HyperdataDocument(..))
39 import Gargantext.Database.Utils (Cmd(..))
40 --import Gargantext.Database.Node.Contact (HyperdataContact(..))
41 import Gargantext.Database.Schema.User (getUser, UserLight(..))
42 import Gargantext.Core.Types.Individu (Username)
43 import Gargantext.Ext.IMT (toSchoolName)
44 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
45 import Gargantext.Prelude
46 import Gargantext.Text.Parsers (parseDocs, FileFormat)
47 import Gargantext.Core.Types.Main
48 --import Gargantext.Core.Types
49 import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
51 flowCorpus :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
52 flowCorpus ff fp cName = do
53 hyperdataDocuments' <- map addUniqIdsDoc <$> parseDocs ff fp
54 params <- flowInsert NodeCorpus hyperdataDocuments' cName
55 flowCorpus' NodeCorpus hyperdataDocuments' params
58 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
59 -> IO ([ReturnId], MasterUserId, MasterCorpusId, UserId, CorpusId)
60 flowInsert _nt hyperdataDocuments cName = do
61 let hyperdataDocuments' = map (\h -> ToDbDocument h) hyperdataDocuments
63 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
64 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeDocument hyperdataDocuments'
66 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
67 _ <- runCmd' $ add userCorpusId (map reId ids)
69 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
72 flowAnnuaire :: FilePath -> IO ()
73 flowAnnuaire filePath = do
74 contacts <- deserialiseImtUsersFromFile filePath
75 ps <- flowInsertAnnuaire "Annuaire" $ map (\h-> ToDbContact h) $ map addUniqIdsContact contacts
76 printDebug "length annuaire" ps
79 flowInsertAnnuaire :: CorpusName -> [ToDbData]
80 -> IO ([ReturnId], UserId, CorpusId, UserId, CorpusId)
81 flowInsertAnnuaire name children = do
83 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
84 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId NodeContact children
86 (userId, _, userCorpusId) <- subFlowAnnuaire userArbitrary name
87 _ <- runCmd' $ add userCorpusId (map reId ids)
89 printDebug "AnnuaireID" userCorpusId
91 pure (ids, masterUserId, masterCorpusId, userId, userCorpusId)
94 flowCorpus' :: NodeType -> [HyperdataDocument]
95 -> ([ReturnId], UserId, CorpusId, UserId, CorpusId)
97 flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
98 --------------------------------------------------
100 userListId <- runCmd' $ flowListUser userId userCorpusId
101 printDebug "Working on User ListId : " userListId
103 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
104 -- printDebug "documentsWithId" documentsWithId
105 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
106 -- printDebug "docsWithNgrams" docsWithNgrams
107 let maps = mapNodeIdNgrams docsWithNgrams
109 -- printDebug "maps" (maps)
110 indexedNgrams <- runCmd' $ indexNgrams maps
111 -- printDebug "inserted ngrams" indexedNgrams
112 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
114 listId2 <- runCmd' $ flowList masterUserId masterCorpusId indexedNgrams
115 printDebug "Working on ListId : " listId2
117 --------------------------------------------------
118 _ <- runCmd' $ mkDashboard userCorpusId userId
119 _ <- runCmd' $ mkGraph userCorpusId userId
122 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
125 -- runCmd' $ del [corpusId2, corpusId]
127 flowCorpus' NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_masterCorpusId,_userId,_userCorpusId) = undefined
128 flowCorpus' _ _ _ = undefined
131 type CorpusName = Text
133 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
134 subFlowCorpus username cName = do
135 maybeUserId <- runCmd' (getUser username)
137 let userId = case maybeUserId of
138 Nothing -> panic "Error: User does not exist (yet)"
139 -- mk NodeUser gargantua_id "Node Gargantua"
140 Just user -> userLight_id user
142 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
144 rootId'' <- case rootId' of
145 [] -> runCmd' (mkRoot username userId)
146 n -> case length n >= 2 of
147 True -> panic "Error: more than 1 userNode / user"
148 False -> pure rootId'
149 let rootId = maybe (panic "error rootId") identity (head rootId'')
151 corpusId'' <- if username == userMaster
153 ns <- runCmd' $ getCorporaWithParentId' rootId
154 pure $ map _node_id ns
159 corpusId' <- if corpusId'' /= []
161 else runCmd' $ mkCorpus (Just cName) Nothing rootId userId
163 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
165 printDebug "(username, userId, rootId, corpusId)"
166 (username, userId, rootId, corpusId)
167 pure (userId, rootId, corpusId)
170 subFlowAnnuaire :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
171 subFlowAnnuaire username _cName = do
172 maybeUserId <- runCmd' (getUser username)
174 let userId = case maybeUserId of
175 Nothing -> panic "Error: User does not exist (yet)"
176 -- mk NodeUser gargantua_id "Node Gargantua"
177 Just user -> userLight_id user
179 rootId' <- map _node_id <$> runCmd' (getRootCmd username)
181 rootId'' <- case rootId' of
182 [] -> runCmd' (mkRoot username userId)
183 n -> case length n >= 2 of
184 True -> panic "Error: more than 1 userNode / user"
185 False -> pure rootId'
186 let rootId = maybe (panic "error rootId") identity (head rootId'')
188 corpusId' <- runCmd' $ mkAnnuaire rootId userId
190 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
192 printDebug "(username, userId, rootId, corpusId)"
193 (username, userId, rootId, corpusId)
194 pure (userId, rootId, corpusId)
198 ------------------------------------------------------------------------
199 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
200 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
202 err = "Database.Flow.toInsert"
204 toInserted :: [ReturnId] -> Map HashId ReturnId
205 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
206 . filter (\r -> reInserted r == True)
208 data DocumentWithId =
209 DocumentWithId { documentId :: NodeId
210 , documentData :: HyperdataDocument
213 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
214 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
216 toDocumentWithId (hash,hpd) =
217 DocumentWithId <$> fmap reId (lookup hash rs)
220 ------------------------------------------------------------------------
222 data DocumentIdWithNgrams =
224 { documentWithId :: DocumentWithId
225 , document_ngrams :: Map (NgramsT Ngrams) Int
228 -- TODO add Terms (Title + Abstract)
229 -- add f :: Text -> Text
230 -- newtype Ngrams = Ngrams Text
231 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
232 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
233 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
234 <> [(NgramsT Authors a' , 1)| a' <- authors ]
236 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
237 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
238 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
244 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
245 -> [DocumentWithId] -> [DocumentIdWithNgrams]
246 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
248 -- | TODO check optimization
249 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
250 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
252 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
253 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
255 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
256 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
257 indexNgrams ng2nId = do
258 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
259 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
262 ------------------------------------------------------------------------
263 ------------------------------------------------------------------------
264 flowList :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
265 flowList uId cId ngs = do
266 -- printDebug "ngs:" ngs
267 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
268 --printDebug "ngs" (DM.keys ngs)
269 -- TODO add stemming equivalence of 2 ngrams
270 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
271 _ <- insertGroups lId groupEd
273 -- compute Candidate / Map
274 let lists = ngrams2list ngs
275 -- printDebug "lists:" lists
277 is <- insertLists lId lists
278 printDebug "listNgrams inserted :" is
282 flowListUser :: UserId -> CorpusId -> Cmd [Int]
283 flowListUser uId cId = mkList cId uId
285 ------------------------------------------------------------------------
287 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
288 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
289 -> Map NgramsIndexed NgramsIndexed
290 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
294 -- TODO check: do not insert duplicates
295 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
296 insertGroups lId ngrs =
297 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
298 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
302 ------------------------------------------------------------------------
303 -- TODO: verify NgramsT lost here
304 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
305 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
307 -- | TODO: weight of the list could be a probability
308 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
309 insertLists lId lngs =
310 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
311 | (l,ngr) <- map (second _ngramsId) lngs
314 ------------------------------------------------------------------------
315 ------------------------------------------------------------------------