]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
[Annuaire] flow to insert in DB.
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE DeriveGeneric #-}
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Database.Flow -- (flowDatabase, ngrams2list)
17 where
18
19 import GHC.Show (Show)
20 import System.FilePath (FilePath)
21 import Data.Maybe (Maybe(..), catMaybes)
22 import Data.Text (Text, splitOn)
23 import Data.Map (Map, lookup)
24 import Data.Tuple.Extra (both, second)
25 import qualified Data.Map as DM
26
27 import Gargantext.Core.Types (NodePoly(..), ListType(..), listTypeId)
28 import Gargantext.Database.Bashql (runCmd') -- , del)
29 import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
30 import Gargantext.Database.Ngrams (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId, NgramsType(..), text2ngrams)
31 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..), mkList, mkGraph, mkDashboard)--, mkAnnuaire)
32 import Gargantext.Database.Types.Node (NodeType(..))
33 import Gargantext.Database.Node.Document.Add (add)
34 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
35 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
36 import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNodeNgramsNgramsNew)
37 import Gargantext.Database.Types.Node (HyperdataDocument(..))
38 import Gargantext.Database.Node.Contact (HyperdataContact(..))
39 import Gargantext.Database.User (getUser, UserLight(..), Username)
40 import Gargantext.Ext.IMT (toSchoolName)
41 import Gargantext.Prelude
42 import Gargantext.Text.Parsers (parseDocs, FileFormat)
43
44 type UserId = Int
45 type MasterUserId = Int
46
47 type RootId = Int
48 type CorpusId = Int
49
50
51 flowDatabase :: FileFormat -> FilePath -> CorpusName -> IO CorpusId
52 flowDatabase ff fp cName = do
53 -- Corpus Flow
54 hyperdataDocuments <- map addUniqIdsDoc <$> parseDocs ff fp
55 params <- flowInsert NodeCorpus hyperdataDocuments cName
56 flowCorpus NodeCorpus hyperdataDocuments params
57
58
59 flowInsert :: NodeType -> [HyperdataDocument] -> CorpusName
60 -> IO ([ReturnId], MasterUserId, UserId, CorpusId)
61 flowInsert nt hyperdataDocuments cName = do
62 let hyperdataDocuments' = case nt of
63 NodeCorpus -> map (\h -> ToDbDocument h) hyperdataDocuments
64 -- NodeAnnuaire -> map (\h -> ToDbContact h) hyperdataDocuments
65
66 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
67 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId hyperdataDocuments'
68
69 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary cName
70 _ <- runCmd' $ add userCorpusId (map reId ids)
71
72 pure (ids, masterUserId, userId, userCorpusId)
73
74 --{-
75 flowInsertAnnuaire children name = do
76
77 (masterUserId, _, masterCorpusId) <- subFlowCorpus userMaster corpusMasterName
78 ids <- runCmd' $ insertDocuments masterUserId masterCorpusId children
79
80 (userId, _, userCorpusId) <- subFlowCorpus userArbitrary name
81 _ <- runCmd' $ add userCorpusId (map reId ids)
82
83 pure (ids, masterUserId, userId, userCorpusId)
84
85
86 --}
87
88 --{-
89 -- flowCorpus :: NodeType -> [HyperdataDocument] -> ([ReturnId],MasterUserId,UserId,CorpusId) -> IO CorpusId
90 flowCorpus NodeCorpus hyperdataDocuments (ids,masterUserId,userId,userCorpusId) = do
91 --}
92 --------------------------------------------------
93 -- List Ngrams Flow
94 userListId <- runCmd' $ listFlowUser userId userCorpusId
95 printDebug "Working on User ListId : " userListId
96
97 let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
98 -- printDebug "documentsWithId" documentsWithId
99 let docsWithNgrams = documentIdWithNgrams extractNgramsT documentsWithId
100 -- printDebug "docsWithNgrams" docsWithNgrams
101 let maps = mapNodeIdNgrams docsWithNgrams
102
103 -- printDebug "maps" (maps)
104 indexedNgrams <- runCmd' $ indexNgrams maps
105 -- printDebug "inserted ngrams" indexedNgrams
106 _ <- runCmd' $ insertToNodeNgrams indexedNgrams
107
108 listId2 <- runCmd' $ listFlow masterUserId userCorpusId indexedNgrams
109 printDebug "Working on ListId : " listId2
110 --}
111
112 --------------------------------------------------
113 _ <- runCmd' $ mkDashboard userCorpusId userId
114 _ <- runCmd' $ mkGraph userCorpusId userId
115
116 -- Annuaire Flow
117 -- _ <- runCmd' $ mkAnnuaire rootUserId userId
118
119 pure userCorpusId
120 -- runCmd' $ del [corpusId2, corpusId]
121
122 flowCorpus NodeAnnuaire _hyperdataDocuments (_ids,_masterUserId,_userId,_userCorpusId) = undefined
123 flowCorpus _ _ _ = undefined
124
125 type CorpusName = Text
126
127 subFlowCorpus :: Username -> CorpusName -> IO (UserId, RootId, CorpusId)
128 subFlowCorpus username cName = do
129 maybeUserId <- runCmd' (getUser username)
130
131 let userId = case maybeUserId of
132 Nothing -> panic "Error: User does not exist (yet)"
133 -- mk NodeUser gargantua_id "Node Gargantua"
134 Just user -> userLight_id user
135
136 rootId' <- map _node_id <$> runCmd' (getRoot userId)
137
138 rootId'' <- case rootId' of
139 [] -> runCmd' (mkRoot username userId)
140 n -> case length n >= 2 of
141 True -> panic "Error: more than 1 userNode / user"
142 False -> pure rootId'
143 let rootId = maybe (panic "error rootId") identity (head rootId'')
144
145 corpusId' <- runCmd' $ mkCorpus (Just cName) Nothing rootId userId
146 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
147
148 printDebug "(username, userId, rootId, corpusId)"
149 (username, userId, rootId, corpusId)
150 pure (userId, rootId, corpusId)
151
152 ------------------------------------------------------------------------
153
154 type HashId = Text
155 type NodeId = Int
156 type ListId = Int
157
158 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
159 toInsert = DM.fromList . map (\d -> (maybe err identity (_hyperdataDocument_uniqId d), d))
160 where
161 err = "Database.Flow.toInsert"
162
163 toInserted :: [ReturnId] -> Map HashId ReturnId
164 toInserted = DM.fromList . map (\r -> (reUniqId r, r) )
165 . filter (\r -> reInserted r == True)
166
167 data DocumentWithId =
168 DocumentWithId { documentId :: NodeId
169 , documentData :: HyperdataDocument
170 } deriving (Show)
171
172 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
173 mergeData rs = catMaybes . map toDocumentWithId . DM.toList
174 where
175 toDocumentWithId (hash,hpd) =
176 DocumentWithId <$> fmap reId (lookup hash rs)
177 <*> Just hpd
178
179 ------------------------------------------------------------------------
180
181 data DocumentIdWithNgrams =
182 DocumentIdWithNgrams
183 { documentWithId :: DocumentWithId
184 , document_ngrams :: Map (NgramsT Ngrams) Int
185 } deriving (Show)
186
187 -- TODO add Terms (Title + Abstract)
188 -- add f :: Text -> Text
189 -- newtype Ngrams = Ngrams Text
190 extractNgramsT :: HyperdataDocument -> Map (NgramsT Ngrams) Int
191 extractNgramsT doc = DM.fromList $ [(NgramsT Sources source, 1)]
192 <> [(NgramsT Institutes i' , 1)| i' <- institutes ]
193 <> [(NgramsT Authors a' , 1)| a' <- authors ]
194 where
195 source = text2ngrams $ maybe "Nothing" identity $ _hyperdataDocument_source doc
196 institutes = map text2ngrams $ maybe ["Nothing"] (map toSchoolName . (splitOn ", ")) $ _hyperdataDocument_institutes doc
197 authors = map text2ngrams $ maybe ["Nothing"] (splitOn ", ") $ _hyperdataDocument_authors doc
198 -- TODO group terms
199
200
201
202
203 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
204 -> [DocumentWithId] -> [DocumentIdWithNgrams]
205 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
206
207 -- | TODO check optimization
208 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
209 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
210 where
211 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
212 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
213
214 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
215 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
216 indexNgrams ng2nId = do
217 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
218 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
219
220
221 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
222 insertToNodeNgrams m = insertNodeNgrams [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
223 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
224 | (ng, nId2int) <- DM.toList m
225 , (nId, n) <- DM.toList nId2int
226 ]
227
228
229 ------------------------------------------------------------------------
230 ------------------------------------------------------------------------
231 listFlow :: UserId -> CorpusId -> Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd ListId
232 listFlow uId cId ngs = do
233 -- printDebug "ngs:" ngs
234 lId <- maybe (panic "mkList error") identity <$> head <$> mkList cId uId
235 --printDebug "ngs" (DM.keys ngs)
236 -- TODO add stemming equivalence of 2 ngrams
237 let groupEd = groupNgramsBy (\(NgramsT t1 n1) (NgramsT t2 n2) -> if (((==) t1 t2) && ((==) n1 n2)) then (Just (n1,n2)) else Nothing) ngs
238 _ <- insertGroups lId groupEd
239
240 -- compute Candidate / Map
241 let lists = ngrams2list ngs
242 -- printDebug "lists:" lists
243
244 is <- insertLists lId lists
245 printDebug "listNgrams inserted :" is
246
247 pure lId
248
249 listFlowUser :: UserId -> CorpusId -> Cmd [Int]
250 listFlowUser uId cId = mkList cId uId
251
252 ------------------------------------------------------------------------
253
254 groupNgramsBy :: (NgramsT NgramsIndexed -> NgramsT NgramsIndexed -> Maybe (NgramsIndexed, NgramsIndexed))
255 -> Map (NgramsT NgramsIndexed) (Map NodeId Int)
256 -> Map NgramsIndexed NgramsIndexed
257 groupNgramsBy isEqual cId = DM.fromList $ catMaybes [ isEqual n1 n2 | n1 <- DM.keys cId, n2 <- DM.keys cId]
258
259
260
261 -- TODO check: do not insert duplicates
262 insertGroups :: ListId -> Map NgramsIndexed NgramsIndexed -> Cmd Int
263 insertGroups lId ngrs =
264 insertNodeNgramsNgramsNew [ NodeNgramsNgrams lId ng1 ng2 (Just 1)
265 | (ng1, ng2) <- map (both _ngramsId) $ DM.toList ngrs
266 , ng1 /= ng2
267 ]
268
269 ------------------------------------------------------------------------
270 -- TODO: verify NgramsT lost here
271 ngrams2list :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> [(ListType,NgramsIndexed)]
272 ngrams2list = zip (repeat CandidateList) . map (\(NgramsT _lost_t ng) -> ng) . DM.keys
273
274 -- | TODO: weight of the list could be a probability
275 insertLists :: ListId -> [(ListType,NgramsIndexed)] -> Cmd Int
276 insertLists lId lngs =
277 insertNodeNgrams [ NodeNgram Nothing lId ngr (fromIntegral $ listTypeId l) (listTypeId l)
278 | (l,ngr) <- map (second _ngramsId) lngs
279 ]
280
281 ------------------------------------------------------------------------
282 ------------------------------------------------------------------------
283