]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[DBFLOW] getViewDocument + uniqIdBdd (enables duplicatas from different Database...
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
33 import Safe (tailMay)
34
35 import Gargantext.Database.Types.Node (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
39
40 ---------------------------------------------------------------
41 data Doc = Doc
42 { d_docId :: !Int
43 , d_title :: !Text
44 , d_source :: !Text
45 , d_publication_year :: !Int
46 , d_publication_month :: !Int
47 , d_publication_day :: !Int
48 , d_abstract :: !Text
49 , d_authors :: !Text
50 }
51 deriving (Show)
52 ---------------------------------------------------------------
53 -- | Doc 2 HyperdataDocument
54 doc2hyperdataDocument :: Doc -> HyperdataDocument
55 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
56 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
57 HyperdataDocument (Just "CSV")
58 (Just . pack . show $ did)
59 Nothing
60 Nothing
61 Nothing
62 Nothing
63 (Just dt)
64 (Just dau)
65 (Just dab)
66 (Nothing)
67 Nothing
68 (Just dpy)
69 (Just dpm)
70 (Just dpd)
71 Nothing
72 Nothing
73 Nothing
74 Nothing
75 ---------------------------------------------------------------
76 -- | Types Conversions
77 toDocs :: Vector CsvDoc -> [Doc]
78 toDocs v = V.toList
79 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
80 -> Doc nId t s py pm pd abst auth )
81 (V.enumFromN 1 (V.length v'')) v''
82 where
83 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
84 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
85
86 ---------------------------------------------------------------
87 fromDocs :: Vector Doc -> Vector CsvDoc
88 fromDocs docs = V.map fromDocs' docs
89 where
90 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
91
92 ---------------------------------------------------------------
93 -- | Split a document in its context
94 -- TODO adapt the size of the paragraph according to the corpus average
95
96 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
97 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
98 if docSize > 1000
99 then
100 if (mod (round m) docSize) >= 10
101 then
102 splitDoc' splt doc
103 else
104 V.fromList [doc]
105 else
106 V.fromList [doc]
107
108
109 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
110 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
111 where
112 firstDoc = CsvDoc t s py pm pd firstAbstract auth
113 firstAbstract = head' abstracts
114
115 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
116
117 abstracts = (splitBy $ contextSize) abst
118 head' x = maybe "" identity (head x)
119 tail' x = maybe [""] identity (tailMay x)
120
121 ---------------------------------------------------------------
122 ---------------------------------------------------------------
123 type Mean = Double
124
125 docsSize :: Vector CsvDoc -> Mean
126 docsSize csvDoc = mean ls
127 where
128 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
129
130
131 ---------------------------------------------------------------
132 data CsvDoc = CsvDoc
133 { csv_title :: !Text
134 , csv_source :: !Text
135 , csv_publication_year :: !Int
136 , csv_publication_month :: !Int
137 , csv_publication_day :: !Int
138 , csv_abstract :: !Text
139 , csv_authors :: !Text
140 }
141 deriving (Show)
142
143 instance FromNamedRecord CsvDoc where
144 parseNamedRecord r = CsvDoc <$> r .: "title"
145 <*> r .: "source"
146 <*> r .: "publication_year"
147 <*> r .: "publication_month"
148 <*> r .: "publication_day"
149 <*> r .: "abstract"
150 <*> r .: "authors"
151
152 instance ToNamedRecord CsvDoc where
153 toNamedRecord (CsvDoc t s py pm pd abst aut) =
154 namedRecord [ "title" .= t
155 , "source" .= s
156 , "publication_year" .= py
157 , "publication_month" .= pm
158 , "publication_day" .= pd
159 , "abstract" .= abst
160 , "authors" .= aut
161 ]
162
163
164 csvDecodeOptions :: DecodeOptions
165 csvDecodeOptions = (defaultDecodeOptions
166 {decDelimiter = fromIntegral $ ord '\t'}
167 )
168
169 csvEncodeOptions :: EncodeOptions
170 csvEncodeOptions = ( defaultEncodeOptions
171 {encDelimiter = fromIntegral $ ord '\t'}
172 )
173
174 ------------------------------------------------------------------------
175 ------------------------------------------------------------------------
176 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
177 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
178 <$> snd
179 <$> readCsv fp
180
181 ------------------------------------------------------------------------
182 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
183 readCsv fp = do
184 csvData <- BL.readFile fp
185 case decodeByNameWith csvDecodeOptions csvData of
186 Left e -> panic (pack e)
187 Right csvDocs -> pure csvDocs
188
189
190 readHal :: FilePath -> IO (Header, Vector CsvHal)
191 readHal fp = do
192 csvData <- BL.readFile fp
193 case decodeByNameWith csvDecodeOptions csvData of
194 Left e -> panic (pack e)
195 Right csvDocs -> pure csvDocs
196 ------------------------------------------------------------------------
197
198
199 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
200 writeCsv fp (h, vs) = BL.writeFile fp $
201 encodeByNameWith csvEncodeOptions h (V.toList vs)
202
203
204 ------------------------------------------------------------------------
205 -- Hal Format
206 data CsvHal = CsvHal
207 { csvHal_title :: !Text
208 , csvHal_source :: !Text
209 , csvHal_publication_year :: !Int
210 , csvHal_publication_month :: !Int
211 , csvHal_publication_day :: !Int
212 , csvHal_abstract :: !Text
213 , csvHal_authors :: !Text
214
215 , csvHal_url :: !Text
216 , csvHal_isbn_s :: !Text
217 , csvHal_issue_s :: !Text
218 , csvHal_journalPublisher_s:: !Text
219 , csvHal_language_s :: !Text
220
221 , csvHal_doiId_s :: !Text
222 , csvHal_authId_i :: !Text
223 , csvHal_instStructId_i :: !Text
224 , csvHal_deptStructId_i :: !Text
225 , csvHal_labStructId_i :: !Text
226
227 , csvHal_rteamStructId_i :: !Text
228 , csvHal_docType_s :: !Text
229 }
230 deriving (Show)
231
232 instance FromNamedRecord CsvHal where
233 parseNamedRecord r = CsvHal <$> r .: "title"
234 <*> r .: "source"
235 <*> r .: "publication_year"
236 <*> r .: "publication_month"
237 <*> r .: "publication_day"
238 <*> r .: "abstract"
239 <*> r .: "authors"
240
241 <*> r .: "url"
242 <*> r .: "isbn_s"
243 <*> r .: "issue_s"
244 <*> r .: "journalPublisher_s"
245 <*> r .: "language_s"
246
247 <*> r .: "doiId_s"
248 <*> r .: "authId_i"
249 <*> r .: "instStructId_i"
250 <*> r .: "deptStructId_i"
251 <*> r .: "labStructId_i"
252
253 <*> r .: "rteamStructId_i"
254 <*> r .: "docType_s"
255
256 instance ToNamedRecord CsvHal where
257 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss jour lang doi auth inst dept lab team doct) =
258 namedRecord [ "title" .= t
259 , "source" .= s
260 , "publication_year" .= py
261 , "publication_month" .= pm
262 , "publication_day" .= pd
263 , "abstract" .= abst
264 , "authors" .= aut
265
266 , "url" .= url
267 , "isbn_s" .= isbn
268 , "issue_s" .= iss
269 , "journalPublisher_s" .= jour
270 , "language_s" .= lang
271
272 , "doiId_s" .= doi
273 , "authId_i" .= auth
274 , "instStructId_i" .= inst
275 , "deptStructId_i" .= dept
276 , "labStructId_i" .= lab
277
278 , "rteamStructId_i" .= team
279 , "docType_s" .= doct
280 ]