]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[BAYES] detect lang (todo proba density).
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
33 import Safe (tailMay)
34
35 import Gargantext.Database.Types.Node (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
39
40 ---------------------------------------------------------------
41 data Doc = Doc
42 { d_docId :: !Int
43 , d_title :: !Text
44 , d_source :: !Text
45 , d_publication_year :: !Int
46 , d_publication_month :: !Int
47 , d_publication_day :: !Int
48 , d_abstract :: !Text
49 , d_authors :: !Text
50 }
51 deriving (Show)
52 ---------------------------------------------------------------
53 -- | Doc 2 HyperdataDocument
54 doc2hyperdataDocument :: Doc -> HyperdataDocument
55 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
56 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
57 HyperdataDocument (Just "CSV")
58 (Just did)
59 Nothing
60 Nothing
61 Nothing
62 (Just dt)
63 (Just dau)
64 (Just dab)
65 (Nothing)
66 Nothing
67 (Just dpy)
68 (Just dpm)
69 (Just dpd)
70 Nothing
71 Nothing
72 Nothing
73 Nothing
74 ---------------------------------------------------------------
75 -- | Types Conversions
76 toDocs :: Vector CsvDoc -> [Doc]
77 toDocs v = V.toList
78 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
79 -> Doc nId t s py pm pd abst auth )
80 (V.enumFromN 1 (V.length v'')) v''
81 where
82 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
83 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
84
85 ---------------------------------------------------------------
86 fromDocs :: Vector Doc -> Vector CsvDoc
87 fromDocs docs = V.map fromDocs' docs
88 where
89 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
90
91 ---------------------------------------------------------------
92 -- | Split a document in its context
93 -- TODO adapt the size of the paragraph according to the corpus average
94
95 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
96 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
97 if docSize > 1000
98 then
99 if (mod (round m) docSize) >= 10
100 then
101 splitDoc' splt doc
102 else
103 V.fromList [doc]
104 else
105 V.fromList [doc]
106
107
108 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
109 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
110 where
111 firstDoc = CsvDoc t s py pm pd firstAbstract auth
112 firstAbstract = head' abstracts
113
114 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
115
116 abstracts = (splitBy $ contextSize) abst
117 head' x = maybe "" identity (head x)
118 tail' x = maybe [""] identity (tailMay x)
119
120 ---------------------------------------------------------------
121 ---------------------------------------------------------------
122 type Mean = Double
123
124 docsSize :: Vector CsvDoc -> Mean
125 docsSize csvDoc = mean ls
126 where
127 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
128
129
130 ---------------------------------------------------------------
131 data CsvDoc = CsvDoc
132 { csv_title :: !Text
133 , csv_source :: !Text
134 , csv_publication_year :: !Int
135 , csv_publication_month :: !Int
136 , csv_publication_day :: !Int
137 , csv_abstract :: !Text
138 , csv_authors :: !Text
139 }
140 deriving (Show)
141
142 instance FromNamedRecord CsvDoc where
143 parseNamedRecord r = CsvDoc <$> r .: "title"
144 <*> r .: "source"
145 <*> r .: "publication_year"
146 <*> r .: "publication_month"
147 <*> r .: "publication_day"
148 <*> r .: "abstract"
149 <*> r .: "authors"
150
151 instance ToNamedRecord CsvDoc where
152 toNamedRecord (CsvDoc t s py pm pd abst aut) =
153 namedRecord [ "title" .= t
154 , "source" .= s
155 , "publication_year" .= py
156 , "publication_month" .= pm
157 , "publication_day" .= pd
158 , "abstract" .= abst
159 , "authors" .= aut
160 ]
161
162
163 csvDecodeOptions :: DecodeOptions
164 csvDecodeOptions = (defaultDecodeOptions
165 {decDelimiter = fromIntegral $ ord '\t'}
166 )
167
168 csvEncodeOptions :: EncodeOptions
169 csvEncodeOptions = ( defaultEncodeOptions
170 {encDelimiter = fromIntegral $ ord '\t'}
171 )
172
173 ------------------------------------------------------------------------
174 ------------------------------------------------------------------------
175 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
176 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
177 <$> snd
178 <$> readCsv fp
179
180 ------------------------------------------------------------------------
181 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
182 readCsv fp = do
183 csvData <- BL.readFile fp
184 case decodeByNameWith csvDecodeOptions csvData of
185 Left e -> panic (pack e)
186 Right csvDocs -> pure csvDocs
187
188
189 readHal :: FilePath -> IO (Header, Vector CsvHal)
190 readHal fp = do
191 csvData <- BL.readFile fp
192 case decodeByNameWith csvDecodeOptions csvData of
193 Left e -> panic (pack e)
194 Right csvDocs -> pure csvDocs
195 ------------------------------------------------------------------------
196
197
198 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
199 writeCsv fp (h, vs) = BL.writeFile fp $
200 encodeByNameWith csvEncodeOptions h (V.toList vs)
201
202
203 ------------------------------------------------------------------------
204 -- Hal Format
205 data CsvHal = CsvHal
206 { csvHal_title :: !Text
207 , csvHal_source :: !Text
208 , csvHal_publication_year :: !Int
209 , csvHal_publication_month :: !Int
210 , csvHal_publication_day :: !Int
211 , csvHal_abstract :: !Text
212 , csvHal_authors :: !Text
213
214 , csvHal_url :: !Text
215 , csvHal_isbn_s :: !Text
216 , csvHal_issue_s :: !Text
217 , csvHal_journalPublisher_s:: !Text
218 , csvHal_language_s :: !Text
219
220 , csvHal_doiId_s :: !Text
221 , csvHal_authId_i :: !Text
222 , csvHal_instStructId_i :: !Text
223 , csvHal_deptStructId_i :: !Text
224 , csvHal_labStructId_i :: !Text
225
226 , csvHal_rteamStructId_i :: !Text
227 , csvHal_docType_s :: !Text
228 }
229 deriving (Show)
230
231 instance FromNamedRecord CsvHal where
232 parseNamedRecord r = CsvHal <$> r .: "title"
233 <*> r .: "source"
234 <*> r .: "publication_year"
235 <*> r .: "publication_month"
236 <*> r .: "publication_day"
237 <*> r .: "abstract"
238 <*> r .: "authors"
239
240 <*> r .: "url"
241 <*> r .: "isbn_s"
242 <*> r .: "issue_s"
243 <*> r .: "journalPublisher_s"
244 <*> r .: "language_s"
245
246 <*> r .: "doiId_s"
247 <*> r .: "authId_i"
248 <*> r .: "instStructId_i"
249 <*> r .: "deptStructId_i"
250 <*> r .: "labStructId_i"
251
252 <*> r .: "rteamStructId_i"
253 <*> r .: "docType_s"
254
255 instance ToNamedRecord CsvHal where
256 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss jour lang doi auth inst dept lab team doct) =
257 namedRecord [ "title" .= t
258 , "source" .= s
259 , "publication_year" .= py
260 , "publication_month" .= pm
261 , "publication_day" .= pd
262 , "abstract" .= abst
263 , "authors" .= aut
264
265 , "url" .= url
266 , "isbn_s" .= isbn
267 , "issue_s" .= iss
268 , "journalPublisher_s" .= jour
269 , "language_s" .= lang
270
271 , "doiId_s" .= doi
272 , "authId_i" .= auth
273 , "instStructId_i" .= inst
274 , "deptStructId_i" .= dept
275 , "labStructId_i" .= lab
276
277 , "rteamStructId_i" .= team
278 , "docType_s" .= doct
279 ]