{-|
Module      : Gargantext.API.Node.DocumentsFromWriteNodes
Description : Build corpus documents from Write (frame) nodes
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE MonoLocalBinds #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}

module Gargantext.API.Node.DocumentsFromWriteNodes
  where

import Conduit
import Control.Lens ((^.))
import Data.Aeson
import Data.Either (Either(..), rights)
import Data.Swagger
import qualified Data.Text as T
import Gargantext.API.Admin.EnvTypes (Env, GargJob(..))
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotalWithMessage)
import Gargantext.API.Prelude (GargM, GargError)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Flow (flowDataText, DataText(..))
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Defaults as Defaults
import Gargantext.Prelude
import Gargantext.Utils.Jobs (serveJobsAPI)
import GHC.Generics (Generic)
import Servant

------------------------------------------------------------------------
type API = Summary "Documents from Write nodes."
             :> AsyncJobs JobLog '[JSON] Params JobLog
------------------------------------------------------------------------
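-- | Job payload. The single @id@ field is decoded from the JSON body by the
-- generic instances below; note that the handler currently works from the
-- node id in the route and ignores this value. An illustrative body:
--
-- > { "id": 42 }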
newtype Params = Params { id :: Int }
  deriving (Generic, Show)
instance FromJSON Params where
  parseJSON = genericParseJSON defaultOptions
instance ToJSON Params where
  toJSON = genericToJSON defaultOptions
instance ToSchema Params
------------------------------------------------------------------------
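-- | Serve the endpoint as an asynchronous job ('DocumentFromWriteNodeJob'):
-- the actual work is done by 'documentsFromWriteNodes', with the job logger
-- lifted into the handler monad.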
api :: UserId -> NodeId -> ServerT API (GargM Env GargError)
api uId nId =
  serveJobsAPI DocumentFromWriteNodeJob $ \p log' ->
    documentsFromWriteNodes uId nId p (liftBase . log')

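-- | Build corpus documents from the Write (frame) nodes below @nId@: find the
-- closest 'NodeCorpus' parent, fetch every 'NodeFrameWrite' child, download
-- each frame's contents, parse them into 'HyperdataDocument's and flow the
-- successfully parsed ones into that corpus.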
documentsFromWriteNodes :: (HasSettings env, FlowCmdM env err m)
                        => UserId
                        -> NodeId
                        -> Params
                        -> (JobLog -> m ())
                        -> m JobLog
documentsFromWriteNodes uId nId _p logStatus = do
  let jobLog = JobLog { _scst_succeeded = Just 1
                      , _scst_failed    = Just 0
                      , _scst_remaining = Just 1
                      , _scst_events    = Just []
                      }
  logStatus jobLog

  mcId <- getClosestParentIdByType' nId NodeCorpus
  cId <- case mcId of
    Just cId -> pure cId
    Nothing -> do
      let msg = T.pack $ "[G.A.N.DFWN] Node has no corpus parent: " <> show nId
      logStatus $ jobLogFailTotalWithMessage msg jobLog
      panic msg

  frameWriteIds <- getChildrenByType nId NodeFrameWrite

  -- https://write.frame.gargantext.org/<frame_id>/download
  frameWrites <- mapM (\id -> getNodeWith id (Proxy :: Proxy HyperdataFrame)) frameWriteIds

  frameWritesWithContents <- liftBase $
    mapM (\node -> do
             contents <- getHyperdataFrameContents (node ^. node_hyperdata)
             pure (node, contents)
         ) frameWrites

  let parsedE = (\(node, contents) ->
                   hyperdataDocumentFromFrameWrite (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
  let parsed = rights parsedE

  _ <- flowDataText (RootId (NodeId uId))
                    (DataNew (Just $ fromIntegral $ length parsed, yieldMany parsed))
                    (Multi EN)
                    cId
                    Nothing
                    logStatus

  pure $ jobLogSuccess jobLog
------------------------------------------------------------------------
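-- | Parse a frame's raw text with 'parseLines' and build a 'HyperdataDocument'
-- from it: title, authors, source and date come from the parsed metadata, and
-- missing date components fall back to the 'Defaults' values.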
hyperdataDocumentFromFrameWrite :: (HyperdataFrame, T.Text) -> Either T.Text HyperdataDocument
hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
  case parseLines contents of
    Left _ -> Left "Error parsing node"
    Right (Parsed { authors, contents = c, date, source, title = t }) ->
      let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]
          authors' = T.concat $ authorJoinSingle <$> authors
          date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-"
                                                            , T.pack $ show month, "-"
                                                            , T.pack $ show day ]) <$> date
          year'  = fromIntegral $ maybe Defaults.year (\(Date { year }) -> year) date
          month' = maybe Defaults.month (\(Date { month }) -> fromIntegral month) date
          day'   = maybe Defaults.day (\(Date { day }) -> fromIntegral day) date in
      Right HyperdataDocument { _hd_bdd = Just "FrameWrite"
                              , _hd_doi = Nothing
                              , _hd_url = Nothing
                              , _hd_uniqId = Nothing
                              , _hd_uniqIdBdd = Nothing
                              , _hd_page = Nothing
                              , _hd_title = Just t
                              , _hd_authors = Just authors'
                              , _hd_institutes = Nothing
                              , _hd_source = source
                              , _hd_abstract = Just c
                              , _hd_publication_date = date'
                              , _hd_publication_year = Just year'
                              , _hd_publication_month = Just month'
                              , _hd_publication_day = Just day'
                              , _hd_publication_hour = Nothing
                              , _hd_publication_minute = Nothing
                              , _hd_publication_second = Nothing
                              , _hd_language_iso2 = Just $ T.pack $ show EN }