]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Flow.hs
Merge branch 'master' into dbflow
[gargantext.git] / src / Gargantext / Database / Flow.hs
1 {-|
2 Module : Gargantext.Database.Flow
3 Description : Database Flow
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 Map (NgramsId, NodeId) -> insert
12 data NgramsType = Sources | Authors | Terms
13 nodes_ngrams : column type, column list
14
15 documents
16 sources
17 authors
18
19 -}
20
21 {-# LANGUAGE DeriveGeneric #-}
22 {-# LANGUAGE NoImplicitPrelude #-}
23 {-# LANGUAGE OverloadedStrings #-}
24
25 module Gargantext.Database.Flow
26 where
27 import System.FilePath (FilePath)
28 import Data.Maybe (Maybe(..))
29 import Data.Text (Text, unpack)
30 import Data.Map (Map)
31 import qualified Data.Map as DM
32 import GHC.Generics (Generic)
33
34 import Gargantext.Core.Types (NodePoly(..))
35 import Gargantext.Prelude
36 import Gargantext.Database.Bashql (runCmd', del)
37 import Gargantext.Database.Types.Node (Node(..), HyperdataDocument(..))
38 import Gargantext.Database.Node (getRoot, mkRoot, mkCorpus, Cmd(..))
39 import Gargantext.Database.User (getUser, UserLight(..), Username)
40 import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIds)
41 import Gargantext.Database.Node.Document.Add (add)
42 import Gargantext.Database.NodeNgram (NodeNgramPoly(..), insertNodeNgrams)
43 import Gargantext.Text.Parsers (parseDocs, FileFormat(WOS))
44 import Gargantext.Database.Ngram (insertNgrams, Ngrams(..), NgramsT(..), NgramsIndexed(..), indexNgramsT, ngramsTypeId)
45
46 type UserId = Int
47 type RootId = Int
48 type CorpusId = Int
49
50 subFlow :: Username -> IO (UserId, RootId, CorpusId)
51 subFlow username = do
52 maybeUserId <- runCmd' (getUser username)
53
54 let userId = case maybeUserId of
55 Nothing -> panic "Error: User does not exist (yet)"
56 -- mk NodeUser gargantua_id "Node Gargantua"
57 Just user -> userLight_id user
58
59 rootId' <- map _node_id <$> runCmd' (getRoot userId)
60
61 rootId'' <- case rootId' of
62 [] -> runCmd' (mkRoot userId)
63 un -> case length un >= 2 of
64 True -> panic "Error: more than 1 userNode / user"
65 False -> pure rootId'
66 let rootId = maybe (panic "error rootId") identity (head rootId'')
67
68 corpusId' <- runCmd' $ mkCorpus (Just "Corpus WOS") Nothing rootId userId
69 let corpusId = maybe (panic "error corpusId") identity (head corpusId')
70
71 printDebug "(username, userId, rootId, corpusId"
72 (username, userId, rootId, corpusId)
73 pure (userId, rootId, corpusId)
74
75
76 flow :: FilePath -> IO Int
77 flow fp = do
78
79 (masterUserId, _, corpusId) <- subFlow "gargantua"
80
81 docs <- map addUniqIds <$> parseDocs WOS fp
82 ids <- runCmd' $ insertDocuments masterUserId corpusId docs
83 printDebug "Docs IDs : " ids
84
85 idsRepeat <- runCmd' $ insertDocuments masterUserId corpusId docs
86 printDebug "Docs IDs : " idsRepeat
87
88 (_, _, corpusId2) <- subFlow "alexandre"
89
90 inserted <- runCmd' $ add corpusId2 (map reId ids)
91 printDebug "Inserted : " inserted
92
93 runCmd' $ del [corpusId2, corpusId]
94
95 ----------------------------------------------------------------
96 type HashId = Text
97 type NodeId = Int
98 type ToInsert = Map HashId HyperdataDocument
99 type Inserted = Map HashId ReturnId
100
101 toInsert :: [HyperdataDocument] -> Map HashId HyperdataDocument
102 toInsert = DM.fromList . map (\d -> (hash (_hyperdataDocument_uniqIdBdd d), d))
103 where
104 hash = maybe "Error" identity
105
106 toInserted :: [ReturnId] -> Map HashId ReturnId
107 toInserted rs = DM.fromList $ map (\r -> (reUniqId r, r) )
108 $ filter (\r -> reInserted r == True) rs
109
110 data DocumentWithId = DocumentWithId { documentId :: NodeId
111 , documentData :: HyperdataDocument
112 }
113
114
115 mergeData :: Map HashId ReturnId -> Map HashId HyperdataDocument -> [DocumentWithId]
116 mergeData rs hs = map (\(hash,r) -> DocumentWithId (reId r) (lookup' hash hs)) $ DM.toList rs
117 where
118 lookup' h xs = maybe (panic $ "Error with " <> h) identity (DM.lookup h xs)
119
120 data DocumentIdWithNgrams = DocumentIdWithNgrams { documentWithId :: DocumentWithId
121 , document_ngrams :: Map (NgramsT Ngrams)Int
122 }
123
124
125 documentIdWithNgrams :: (HyperdataDocument -> Map (NgramsT Ngrams) Int)
126 -> [DocumentWithId] -> [DocumentIdWithNgrams]
127 documentIdWithNgrams f = map (\d -> DocumentIdWithNgrams d ((f . documentData) d))
128
129 -- | TODO check optimization
130 mapNodeIdNgrams :: [DocumentIdWithNgrams] -> Map (NgramsT Ngrams) (Map NodeId Int)
131 mapNodeIdNgrams ds = DM.map (DM.fromListWith (+)) $ DM.fromListWith (<>) xs
132 where
133 xs = [(ng, [(nId, i)]) | (nId, n2i') <- n2i ds, (ng, i) <- DM.toList n2i']
134 n2i = map (\d -> ((documentId . documentWithId) d, document_ngrams d))
135
136 indexNgrams :: Map (NgramsT Ngrams ) (Map NodeId Int)
137 -> Cmd (Map (NgramsT NgramsIndexed) (Map NodeId Int))
138 indexNgrams ng2nId = do
139 terms2id <- insertNgrams (map _ngramsT $ DM.keys ng2nId)
140 pure $ DM.mapKeys (indexNgramsT terms2id) ng2nId
141
142
143 insertToNodeNgrams :: Map (NgramsT NgramsIndexed) (Map NodeId Int) -> Cmd Int
144 insertToNodeNgrams m = insertNodeNgrams $ [ NodeNgram Nothing nId ((_ngramsId . _ngramsT ) ng)
145 (fromIntegral n) ((ngramsTypeId . _ngramsType) ng)
146
147 | (ng, nId2int) <- DM.toList m
148 , (nId, n) <- DM.toList nId2int
149 ]
150
151 -- mk ListGroup
152 -- groupBy fun
153 -- insertInto NodeNgramsNgrams
154
155 -- compute Candidate / Map
156 -- add column typelist
157 -- insertNodeNodeNgram
158
159 -- get data of NgramsTable
160 -- post :: update NodeNodeNgrams
161 -- group ngrams
162