]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/DocumentsFromWriteNodes.hs
[upload zip] some more work on zipfile parsing
[gargantext.git] / src / Gargantext / API / Node / DocumentsFromWriteNodes.hs
1 {-|
2 Module : Gargantext.API.Node.DocumentsFromWriteNodes
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE MonoLocalBinds #-}
13 {-# LANGUAGE TemplateHaskell #-}
14 {-# LANGUAGE TypeOperators #-}
15 {-# OPTIONS_GHC -fno-warn-orphans #-}
16
17 module Gargantext.API.Node.DocumentsFromWriteNodes
18 where
19
20 import Control.Lens ((^.))
21 import Data.Aeson
22 import Data.Either (Either(..), rights)
23 import Data.Swagger
24 import qualified Data.Text as T
25 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
26 import Gargantext.API.Admin.Types (HasSettings)
27 import Gargantext.API.Prelude (GargServer)
28 import Gargantext.Core (Lang(..))
29 import Gargantext.Core.Text.Corpus.Parsers.FrameWrite
30 import Gargantext.Core.Text.Terms (TermType(..))
31 import Gargantext.Core.Types.Individu (User(..))
32 import Gargantext.Database.Action.Flow (flowDataText, DataText(..))
33 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
34 import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
35 import Gargantext.Database.Admin.Types.Hyperdata.Frame
36 import Gargantext.Database.Admin.Types.Node
37 import Gargantext.Database.Query.Table.Node (getChildrenByType, getClosestParentIdByType', getNodeWith)
38 import Gargantext.Database.Schema.Node (node_hyperdata)
39 import Gargantext.Prelude
40 import GHC.Generics (Generic)
41 import Servant
42 import Servant.Job.Async (JobFunction(..), serveJobsAPI)
43
44 ------------------------------------------------------------------------
45 type API = Summary " Documents from Write nodes."
46 :> AsyncJobs JobLog '[JSON] Params JobLog
47 ------------------------------------------------------------------------
48 newtype Params = Params { id :: Int }
49 deriving (Generic, Show)
50
51 instance FromJSON Params where
52 parseJSON = genericParseJSON defaultOptions
53 instance ToJSON Params where
54 toJSON = genericToJSON defaultOptions
55 instance ToSchema Params
56 ------------------------------------------------------------------------
57 api :: UserId -> NodeId -> GargServer API
58 api uId nId =
59 serveJobsAPI $
60 JobFunction (\p log'' ->
61 let
62 log' x = do
63 liftBase $ log'' x
64 in documentsFromWriteNodes uId nId p (liftBase . log')
65 )
66
67 documentsFromWriteNodes :: (HasSettings env, FlowCmdM env err m)
68 => UserId
69 -> NodeId
70 -> Params
71 -> (JobLog -> m ())
72 -> m JobLog
73 documentsFromWriteNodes uId nId _p logStatus = do
74
75 logStatus JobLog { _scst_succeeded = Just 1
76 , _scst_failed = Just 0
77 , _scst_remaining = Just 1
78 , _scst_events = Just []
79 }
80
81 mcId <- getClosestParentIdByType' nId NodeCorpus
82 let cId = maybe (panic "[G.A.N.DFWN] Node has no parent") identity mcId
83
84 frameWriteIds <- getChildrenByType nId NodeFrameWrite
85
86 -- https://write.frame.gargantext.org/<frame_id>/download
87 frameWrites <- mapM (\id -> getNodeWith id (Proxy :: Proxy HyperdataFrame)) frameWriteIds
88
89 frameWritesWithContents <- liftBase $
90 mapM (\node -> do
91 contents <- getHyperdataFrameContents (node ^. node_hyperdata)
92 pure (node, contents)
93 ) frameWrites
94
95 let parsedE = (\(node, contents) -> hyperdataDocumentFromFrameWrite (node ^. node_hyperdata, contents)) <$> frameWritesWithContents
96 let parsed = rights parsedE
97
98 _ <- flowDataText (RootId (NodeId uId)) (DataNew [parsed]) (Multi EN) cId Nothing
99
100 pure JobLog { _scst_succeeded = Just 2
101 , _scst_failed = Just 0
102 , _scst_remaining = Just 0
103 , _scst_events = Just []
104 }
105 ------------------------------------------------------------------------
106 hyperdataDocumentFromFrameWrite :: (HyperdataFrame, T.Text) -> Either T.Text HyperdataDocument
107 hyperdataDocumentFromFrameWrite (HyperdataFrame { _hf_base, _hf_frame_id }, contents) =
108 case parseLines contents of
109 Left _ -> Left "Error parsing node"
110 Right (Parsed { authors, contents = c, date, source, title = t }) ->
111 let authorJoinSingle (Author { firstName, lastName }) = T.concat [ lastName, ", ", firstName ]
112 authors' = T.concat $ authorJoinSingle <$> authors
113 date' = (\(Date { year, month, day }) -> T.concat [ T.pack $ show year, "-"
114 , T.pack $ show month, "-"
115 , T.pack $ show day ]) <$> date
116 year' = fromIntegral $ maybe 2021 (\(Date { year }) -> year) date
117 month' = fromIntegral $ maybe 10 (\(Date { month }) -> month) date
118 day' = fromIntegral $ maybe 4 (\(Date { day }) -> day) date in
119 Right HyperdataDocument { _hd_bdd = Just "FrameWrite"
120 , _hd_doi = Nothing
121 , _hd_url = Nothing
122 , _hd_uniqId = Nothing
123 , _hd_uniqIdBdd = Nothing
124 , _hd_page = Nothing
125 , _hd_title = Just t
126 , _hd_authors = Just authors'
127 , _hd_institutes = Nothing
128 , _hd_source = source
129 , _hd_abstract = Just c
130 , _hd_publication_date = date'
131 , _hd_publication_year = Just year'
132 , _hd_publication_month = Just month'
133 , _hd_publication_day = Just day'
134 , _hd_publication_hour = Nothing
135 , _hd_publication_minute = Nothing
136 , _hd_publication_second = Nothing
137 , _hd_language_iso2 = Just $ T.pack $ show EN }