{-|
Module      : Gargantext.API.Node.DocumentsFromWriteNodes
Description : Build corpus documents from the write (frame) nodes of a node
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE MonoLocalBinds  #-}
{-# LANGUAGE NamedFieldPuns  #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators   #-}

module Gargantext.API.Node.DocumentsFromWriteNodes
  where

import Conduit
import Control.Lens ((^.))
import Data.Aeson
import Data.Either (Either(..), rights)
import Data.Swagger
import qualified Data.Text as T
import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (jobLogSuccess, jobLogFailTotalWithMessage)
import Gargantext.API.Prelude (GargServer)
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite
import Gargantext.Core.Text.Terms (TermType(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Flow (flowDataText, DataText(..))
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Database.Admin.Types.Hyperdata.Frame
import Gargantext.Database.Admin.Types.Node
import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
import GHC.Generics (Generic)
import Servant
import Servant.Job.Async (JobFunction(..), serveJobsAPI)

------------------------------------------------------------------------
type API = Summary "Documents from Write nodes."
             :> AsyncJobs JobLog '[JSON] Params JobLog
------------------------------------------------------------------------
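-- | Job payload. Note that 'documentsFromWriteNodes' currently ignores this
--   payload ('_p') and works on the 'NodeId' taken from the route instead.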
newtype Params = Params { id :: Int }
  deriving (Generic, Show)

instance FromJSON Params where
  parseJSON = genericParseJSON defaultOptions
instance ToJSON Params where
  toJSON = genericToJSON defaultOptions
instance ToSchema Params
------------------------------------------------------------------------
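-- | Serve the async job: wrap 'documentsFromWriteNodes' as a 'JobFunction',
--   lifting the orchestrator's logging callback into the command monad.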
api :: UserId -> NodeId -> GargServer API
api uId nId =
  serveJobsAPI $
    JobFunction (\p log'' ->
      let
        log' x = liftBase $ log'' x
      in documentsFromWriteNodes uId nId p log'
      )

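-- | The job itself: find the closest corpus parent of the node, fetch its
--   children of type 'NodeFrameWrite', download each frame's contents, parse
--   them into 'HyperdataDocument's and flow them into that corpus.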
documentsFromWriteNodes :: (HasSettings env, FlowCmdM env err m)
                        => UserId
                        -> NodeId
                        -> Params
                        -> (JobLog -> m ())
                        -> m JobLog
documentsFromWriteNodes uId nId _p logStatus = do
  let jobLog = JobLog { _scst_succeeded = Just 1
                      , _scst_failed    = Just 0
                      , _scst_remaining = Just 1
                      , _scst_events    = Just []
                      }
  logStatus jobLog

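  -- Documents must flow into a corpus: look up the closest parent of type
  -- NodeCorpus and fail the whole job if there is none.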
  mcId <- getClosestParentIdByType' nId NodeCorpus
  cId <- case mcId of
    Just cId -> pure cId
    Nothing -> do
      let msg = T.pack $ "[G.A.N.DFWN] Node has no corpus parent: " <> show nId
      logStatus $ jobLogFailTotalWithMessage msg jobLog
      panic msg

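  -- Collect the children of type NodeFrameWrite under the given node.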
  frameWriteIds <- getChildrenByType nId NodeFrameWrite

  -- https://write.frame.gargantext.org/<frame_id>/download
  frameWrites <- mapM (\frameWriteId -> getNodeWith frameWriteId (Proxy :: Proxy HyperdataFrame)) frameWriteIds

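  -- Fetch the raw contents of each frame; 'getHyperdataFrameContents' runs in
  -- IO, hence the liftBase.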
  frameWritesWithContents <- liftBase $
    mapM (\node -> do
             contents <- getHyperdataFrameContents (node ^. node_hyperdata)
             pure (node, contents)
         ) frameWrites

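  -- Parse each frame into a HyperdataDocument; frames that fail to parse are
  -- dropped ('rights').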
  let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite (node ^. node_hyperdata, contents))
                  <$> frameWritesWithContents
  let parsed  = rights parsedE

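  -- Flow the parsed documents into the corpus, streamed as new data
  -- ('DataNew') through a conduit source.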
  _ <- flowDataText (RootId (NodeId uId))
                    (DataNew (Just $ fromIntegral $ length parsed, yieldMany parsed))
                    (Multi EN) cId Nothing logStatus

  pure $ jobLogSuccess jobLog
------------------------------------------------------------------------
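-- | Map a frame's parsed contents (title, authors, date, source, text) onto a
--   'HyperdataDocument'. Returns a 'Left' message when 'parseLines' fails.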
hyperdataDocumentFromFrameWrite :: (HyperdataFrame, T.Text) -> Either T.Text HyperdataDocument
hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
  case parseLines contents of
    Left _ -> Left "Error parsing node"
    Right (Parsed { authors, contents = c, date, source, title = t }) ->
      let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]
          authors' = T.concat $ authorJoinSingle <$> authors
          date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-"
                                                            , T.pack $ show month, "-"
                                                            , T.pack $ show day ]) <$> date
          -- When the frame carries no date, default to 2021-10-04
          year'  = fromIntegral $ maybe 2021 (\(Date { year })  -> year)  date
          month' = fromIntegral $ maybe 10   (\(Date { month }) -> month) date
          day'   = fromIntegral $ maybe 4    (\(Date { day })   -> day)   date in
      Right HyperdataDocument { _hd_bdd = Just "FrameWrite"
                              , _hd_doi = Nothing
                              , _hd_url = Nothing
                              , _hd_uniqId = Nothing
                              , _hd_uniqIdBdd = Nothing
                              , _hd_page = Nothing
                              , _hd_title = Just t
                              , _hd_authors = Just authors'
                              , _hd_institutes = Nothing
                              , _hd_source = source
                              , _hd_abstract = Just c
                              , _hd_publication_date = date'
                              , _hd_publication_year = Just year'
                              , _hd_publication_month = Just month'
                              , _hd_publication_day = Just day'
                              , _hd_publication_hour = Nothing
                              , _hd_publication_minute = Nothing
                              , _hd_publication_second = Nothing
                              , _hd_language_iso2 = Just $ T.pack $ show EN }