Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/GrandDebat.hs
[FIX] Clean Text before sending it to NLP micro services + tests + clean code for...
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / GrandDebat.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.GrandDebat
3 Description : Grand Debat Types
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 _flowCorpusDebat :: FlowCmdM env err m
11 => User -> Either CorpusName [CorpusId]
12 -> Limit -> FilePath
13 -> m CorpusId
14 _flowCorpusDebat u n l fp = do
15 docs <- liftBase ( splitEvery 500
16 <$> take l
17 <$> readFile' fp
18 :: IO [[GD.GrandDebatReference ]]
19 )
20 flowCorpus u n (Multi FR) (map (map toHyperdataDocument) docs)
21
22
23 -}
24
25
26 module Gargantext.Core.Text.Corpus.Parsers.GrandDebat
27 where
28
29 import Data.Aeson (ToJSON, FromJSON)
30 import Data.Text (Text)
31 import GHC.Generics (Generic)
32 import Gargantext.Core (Lang(..))
33 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..), ToHyperdataDocument, toHyperdataDocument)
34 import Gargantext.Prelude
35 import Gargantext.Database.GargDB
36 import qualified Data.ByteString.Lazy as DBL
37 import qualified Data.JsonStream.Parser as P
38 import qualified Data.Text as Text
39
-- | A single Grand Debat reference record, as read from the JSON export.
--
-- Every field is optional ('Maybe') and strict. The record field names are
-- part of the interface (they are used by the Generic-based JSON instances
-- declared in this module), so they must not be renamed.
data GrandDebatReference = GrandDebatReference
  { -- Identification.
    id            :: !(Maybe Text)
  , reference     :: !(Maybe Text)
  , title         :: !(Maybe Text)

    -- Presumably timestamps; they are kept as unparsed text.
  , createdAt     :: !(Maybe Text)
  , publishedAt   :: !(Maybe Text)
  , updatedAt     :: !(Maybe Text)

    -- Trash state.
  , trashed       :: !(Maybe Bool)
  , trashedStatus :: !(Maybe Text)

    -- Author information.
  , authorId      :: !(Maybe Text)
  , authorType    :: !(Maybe Text)
  , authorZipCode :: !(Maybe Text)

    -- Answers attached to this record.
  , responses     :: !(Maybe [GrandDebatResponse])
  }
  deriving (Show, Generic)
59
60
-- | One answer inside a 'GrandDebatReference'.
--
-- All four fields are optional and strict. The constructor's field order
-- (question id, question title, value, formatted value) matters: the
-- 'ToHyperdataDocument' instance in this module matches it positionally.
data GrandDebatResponse = GrandDebatResponse
  { questionId     :: !(Maybe Text)
  , questionTitle  :: !(Maybe Text)
  , value          :: !(Maybe Text)
  , formattedValue :: !(Maybe Text)
  }
  deriving (Show, Generic)
68
-- JSON codecs for both record types. No methods are written out, so each
-- instance falls back on the class defaults (which rely on the 'Generic'
-- instances derived above).

instance FromJSON GrandDebatReference
instance ToJSON   GrandDebatReference

instance FromJSON GrandDebatResponse
instance ToJSON   GrandDebatResponse
74
75
-- | Turn a 'GrandDebatReference' into a 'HyperdataDocument'.
--
-- Field mapping:
--
-- * @_hd_bdd@ is always @"GrandDebat"@ and @_hd_language_iso2@ is always
--   the rendering of 'FR'.
-- * @id@, @title@, @authorZipCode@ and @publishedAt@ are copied to
--   @_hd_doi@, @_hd_title@, @_hd_source@ and @_hd_publication_date@.
-- * @authorType@ feeds both @_hd_authors@ and @_hd_institutes@.
-- * @_hd_abstract@ is built from the responses (see @toAbstract@ below).
-- * Every other field is left as 'Nothing'.
instance ToHyperdataDocument GrandDebatReference
  where
    -- Explicit field bindings are used (instead of field puns) so that the
    -- local names do not shadow anything called @id@ or @title@.
    toHyperdataDocument
      ( GrandDebatReference
          { id            = refId
          , title         = refTitle
          , publishedAt   = refPublishedAt
          , authorType    = refAuthorType
          , authorZipCode = refAuthorZipCode
          , responses     = refResponses
          }
      ) =
        HyperdataDocument
          { _hd_bdd                = Just "GrandDebat"
          , _hd_doi                = refId
          , _hd_url                = Nothing
          , _hd_uniqId             = Nothing
          , _hd_uniqIdBdd          = Nothing
          , _hd_page               = Nothing
          , _hd_title              = refTitle
            -- NOTE(review): authors and institutes are both filled from
            -- authorType; confirm that this duplication is intended.
          , _hd_authors            = refAuthorType
          , _hd_institutes         = refAuthorType
          , _hd_source             = refAuthorZipCode
          , _hd_abstract           = toAbstract <$> refResponses
          , _hd_publication_date   = refPublishedAt
          , _hd_publication_year   = Nothing
          , _hd_publication_month  = Nothing
          , _hd_publication_day    = Nothing
          , _hd_publication_hour   = Nothing
          , _hd_publication_minute = Nothing
          , _hd_publication_second = Nothing
          , _hd_language_iso2      = Just $ Text.pack $ show FR
          }
      where
        -- A formatted value is kept only when it is strictly longer than
        -- this many characters.
        minSentenceLength :: Int
        minSentenceLength = 10

        -- Join the kept sentences with " . ". When the response list is
        -- present but nothing is kept, the result is the empty text, so
        -- the abstract is @Just ""@ rather than 'Nothing'.
        toAbstract :: [GrandDebatResponse] -> Text
        toAbstract answers = Text.intercalate " . " (keptSentences answers)

        -- Select the formatted value of each response. Responses without
        -- one fail the pattern and are skipped, and values that are too
        -- short are dropped by the guard.
        keptSentences :: [GrandDebatResponse] -> [Text]
        keptSentences answers =
          [ sentence
          | GrandDebatResponse { formattedValue = Just sentence } <- answers
          , Text.length sentence > minSentenceLength
          ]
105
-- | Read a file containing a JSON array of 'GrandDebatReference' values.
instance ReadFile [GrandDebatReference]
  where
    -- Three implementations have worked here, each more optimized than the
    -- previous one. The first two are kept for reference:
    --
    --   readFile fp = maybe [] identity <$> decode <$> DBL.readFile fp
    --   readFile fp = either (panic . Text.pack) identity <$> P.eitherDecode <$> DBL.readFile fp
    --
    -- The current one reads the file as a lazy ByteString and feeds it to
    -- the json-stream parser, decoding each element of the top-level array.
    -- NOTE(review): behaviour on malformed input is decided by
    -- 'P.parseLazyByteString'; confirm against the json-stream docs.
    readFile' fp = do
      contents <- DBL.readFile fp
      pure (P.parseLazyByteString (P.arrayOf P.value) contents)
112
113
114