2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 {-# LANGUAGE DeriveGeneric #-}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
17 module Gargantext.Database.Flow (flowDatabase, ngrams2list)
20 import GHC.Show (Show)
21 import System.FilePath (FilePath)
22 import Data.Maybe (Maybe(..), catMaybes)
23 import Data.Text (Text, splitOn)
24 import Data.Map (Map, lookup)
25 import Data.Tuple.Extra (both, second)
26 import qualified Data.Map as DM
28 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
29 import Gargantext.Database.Bashql (runCmd') -- , del)
30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard, mkAnnuaire)
32 import Gargantext.Database.Node.Document.Add (add)
33 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
34 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
35 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 import Gargantext.Database.User (getUser, UserLight(..), Username)
38 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
39 import Gargantext.Prelude
40 import Gargantext.Text.Parsers (parseDocs, FileFormat)
41 import Gargantext.Ext.IMT (toSchoolName)
47 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO Int
48 flowDatabase ff fp cName = do
51 (masterUserId, _, corpusId) <- subFlowCorpus userMaster corpusMasterName
54 hyperdataDocuments <- map addUniqIds <$> parseDocs ff fp
56 printDebug "hyperdataDocuments" hyperdataDocuments
58 ids <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
59 --printDebug "Docs IDs : " (ids)
60 idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId hyperdataDocuments
61 printDebug "Repeated Docs IDs : " (length idsRepeat)
64 -- todo: flow for new documents only
65 let tids = toInserted ids
66 printDebug "toInserted ids" (length tids, tids)
68 let tihs = toInsert hyperdataDocuments
69 printDebug "toInsert hyperdataDocuments" (length tihs, tihs)
71 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
72 printDebug "documentsWithId" documentsWithId
74 -- docsWithNgrams <- documentIdWithNgrams documentsWithId extractNgramsT
75 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
76 printDebug "docsWithNgrams" docsWithNgrams
78 let maps = mapNodeIdNgrams docsWithNgrams
79 printDebug "maps" (maps)
81 indexedNgrams <- runCmd' $ indexNgrams maps
82 printDebug "inserted ngrams" indexedNgrams
83 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
86 listId2 <- runCmd' $ listFlow masterUserId corpusId indexedNgrams
87 printDebug "list id : " listId2
89 (userId, rootUserId, corpusId2) <- subFlowCorpus userArbitrary cName
91 userListId <- runCmd' $ listFlowUser userId corpusId2
92 printDebug "UserList : " userListId
94 inserted <- runCmd' $ add corpusId2 (map reId ids)
95 printDebug "Inserted : " (length inserted)
97 _ <- runCmd' $ mkDashboard corpusId2 userId
98 _ <- runCmd' $ mkGraph corpusId2 userId
101 annuaireId <- runCmd' $ mkAnnuaire rootUserId userId
104 -- runCmd' $ del [corpusId2, corpusId]
106 type CorpusName = Text
108 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
109 subFlowCorpus username cName = do
110 maybeUserId <- runCmd' (getUser username)
112 let userId = case maybeUserId of
113 Nothing -> panic "Error: User does not exist (yet)"
114 -- mk NodeUser gargantua_id "Node Gargantua"
115 Just user -> userLight_id user
117 rootId' <- map _node_id <$> runCmd' (getRoot userId)
119 rootId'' <- case rootId' of
120 [] -> runCmd' (mkRoot username userId)
121 n -> case length n >= 2 of
122 True -> panic "Error: more than 1 userNode / user"
123 False -> pure rootId'
124 let rootId = maybe (panic "error rootId") identity (head rootId'')
126 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
127 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
129 printDebug "(username, userId, rootId, corpusId)"
130 (username, userId, rootId, corpusId)
131 pure (userId, rootId, corpusId)
133 ------------------------------------------------------------------------
139 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
140 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqId d), d))
142 hash = maybe "Error" identity
144 toInserted :: [ReturnId] -> Map HashId ReturnId
145 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
146 $ filter (\r -> reInserted r == True) rs
148 data DocumentWithId =
149 DocumentWithId { documentId :: NodeId
150 , documentData :: HyperdataDocument
153 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
154 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
156 toDocumentWithId (hash,hpd) =
157 DocumentWithId <$> fmap reId (lookup hash rs)
160 ------------------------------------------------------------------------
162 data DocumentIdWithNgrams =
164 { documentWithId :: DocumentWithId
165 , document_ngrams :: Map (NgramsT Ngrams) Int
168 -- TODO add Terms (Title + Abstract)
169 -- add f :: Text -> Text
170 -- newtype Ngrams = Ngrams Text
171 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
172 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
173 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
174 <> [(NgramsT Authors a' , 1)| a' <- authors ]
176 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
177 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
178 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
181 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
182 -> [DocumentWithId] -> [DocumentIdWithNgrams]
183 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
185 -- | TODO check optimization
186 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
187 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
189 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
190 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
192 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
193 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
194 indexNgrams ng2nId = do
195 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
196 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
199 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
200 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
201 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
202 | (ng, nId2int) <- DM.toList m
203 , (nId, n) <- DM.toList nId2int
207 ------------------------------------------------------------------------
208 ------------------------------------------------------------------------
209 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
210 listFlow uId cId ngs = do
211 printDebug "ngs:" ngs
212 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
213 printDebug "ngs" (DM.keys ngs)
214 -- TODO add stemming equivalence of 2 ngrams
215 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
216 _ <- insertGroups lId groupEd
218 -- compute Candidate / Map
219 let lists = ngrams2list ngs
220 printDebug "lists:" lists
222 is <- insertLists lId lists
223 printDebug "listNgrams inserted :" is
227 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
228 listFlowUser uId cId = mkList cId uId
230 ------------------------------------------------------------------------
232 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
233 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
234 -> Map NgramsIndexed NgramsIndexed
235 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
239 -- TODO check: do not insert duplicates
240 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
241 insertGroups lId ngrs =
242 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
243 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
247 ------------------------------------------------------------------------
248 -- TODO: verify NgramsT lost here
249 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
250 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
252 -- | TODO: weight of the list could be a probability
253 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
254 insertLists lId lngs =
255 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
256 | (l,ngr) <- map (second _ngramsId) lngs
259 ------------------------------------------------------------------------
260 ------------------------------------------------------------------------