]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[FIX] mime files for a dependency (servant-static).
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
33 import Safe (tailMay)
34
35 import Gargantext.Core.Types.Node (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
39
40 ---------------------------------------------------------------
41 data Doc = Doc
42 { d_docId :: !Int
43 , d_title :: !Text
44 , d_source :: !Text
45 , d_publication_year :: !Int
46 , d_publication_month :: !Int
47 , d_publication_day :: !Int
48 , d_abstract :: !Text
49 , d_authors :: !Text
50 }
51 deriving (Show)
52 ---------------------------------------------------------------
53 -- | Doc 2 HyperdataDocument
54 doc2hyperdataDocument :: Doc -> HyperdataDocument
55 doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
56 HyperdataDocument (Just "CSV")
57 (Just did)
58 Nothing
59 Nothing
60 (Just dt)
61 (Just dau)
62 (Just ds)
63 (Just dab)
64 (Nothing)
65 Nothing
66 (Just dpy)
67 (Just dpm)
68 (Just dpd)
69 Nothing
70 Nothing
71 Nothing
72 ---------------------------------------------------------------
73 -- | Types Conversions
74 toDocs :: Vector CsvDoc -> [Doc]
75 toDocs v = V.toList
76 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
77 -> Doc nId t s py pm pd abst auth )
78 (V.enumFromN 1 (V.length v'')) v''
79 where
80 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
81 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
82
83 ---------------------------------------------------------------
84 fromDocs :: Vector Doc -> Vector CsvDoc
85 fromDocs docs = V.map fromDocs' docs
86 where
87 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
88
89 ---------------------------------------------------------------
90 -- | Split a document in its context
91 -- TODO adapt the size of the paragraph according to the corpus average
92
93 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
94 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
95 if docSize > 1000
96 then
97 if (mod (round m) docSize) >= 10
98 then
99 splitDoc' splt doc
100 else
101 V.fromList [doc]
102 else
103 V.fromList [doc]
104
105
106 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
107 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
108 where
109 firstDoc = CsvDoc t s py pm pd firstAbstract auth
110 firstAbstract = head' abstracts
111
112 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
113
114 abstracts = (splitBy $ contextSize) abst
115 head' x = maybe "" identity (head x)
116 tail' x = maybe [""] identity (tailMay x)
117
118 ---------------------------------------------------------------
119 ---------------------------------------------------------------
120 type Mean = Double
121
122 docsSize :: Vector CsvDoc -> Mean
123 docsSize csvDoc = mean ls
124 where
125 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
126
127
128 ---------------------------------------------------------------
129 data CsvDoc = CsvDoc
130 { csv_title :: !Text
131 , csv_source :: !Text
132 , csv_publication_year :: !Int
133 , csv_publication_month :: !Int
134 , csv_publication_day :: !Int
135 , csv_abstract :: !Text
136 , csv_authors :: !Text
137 }
138 deriving (Show)
139
140 instance FromNamedRecord CsvDoc where
141 parseNamedRecord r = CsvDoc <$> r .: "title"
142 <*> r .: "source"
143 <*> r .: "publication_year"
144 <*> r .: "publication_month"
145 <*> r .: "publication_day"
146 <*> r .: "abstract"
147 <*> r .: "authors"
148
149 instance ToNamedRecord CsvDoc where
150 toNamedRecord (CsvDoc t s py pm pd abst aut) =
151 namedRecord [ "title" .= t
152 , "source" .= s
153 , "publication_year" .= py
154 , "publication_month" .= pm
155 , "publication_day" .= pd
156 , "abstract" .= abst
157 , "authors" .= aut
158 ]
159
160
161 csvDecodeOptions :: DecodeOptions
162 csvDecodeOptions = (defaultDecodeOptions
163 {decDelimiter = fromIntegral $ ord '\t'}
164 )
165
166 csvEncodeOptions :: EncodeOptions
167 csvEncodeOptions = ( defaultEncodeOptions
168 {encDelimiter = fromIntegral $ ord '\t'}
169 )
170
171 ------------------------------------------------------------------------
172 ------------------------------------------------------------------------
173 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
174 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
175 <$> snd
176 <$> readCsv fp
177
178 ------------------------------------------------------------------------
179 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
180 readCsv fp = do
181 csvData <- BL.readFile fp
182 case decodeByNameWith csvDecodeOptions csvData of
183 Left e -> panic (pack e)
184 Right csvDocs -> pure csvDocs
185
186
187 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
188 writeCsv fp (h, vs) = BL.writeFile fp $
189 encodeByNameWith csvEncodeOptions h (V.toList vs)
190