{-|
Module      : Gargantext.Text.Parsers.CSV
Description : CSV parser for Gargantext corpus files
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

CSV parser for Gargantext corpus files.

-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric     #-}

module Gargantext.Text.Parsers.CSV where

import GHC.Real (round)
import GHC.IO (FilePath)

import Control.Applicative

import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL

import Data.Vector (Vector)
import qualified Data.Vector as V
import Safe (tailMay)

import Gargantext.Text
import Gargantext.Text.Context
import Gargantext.Prelude hiding (length)

---------------------------------------------------------------
-- | A corpus document as handled internally: the 'CsvDoc' fields
--   plus a document identifier.
data Doc = Doc
  { d_docId             :: !Int
  , d_title             :: !Text
  , d_source            :: !Text
  , d_publication_year  :: !Int
  , d_publication_month :: !Int
  , d_publication_day   :: !Int
  , d_abstract          :: !Text
  , d_authors           :: !Text
  }
  deriving (Show)
---------------------------------------------------------------
-- | Convert CSV rows into numbered 'Doc's, re-splitting long abstracts
--   first (see 'splitDoc').
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList
         $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
                       -> Doc nId t s py pm pd abst auth)
                     (V.enumFromN 1 (V.length v'')) v''
  where
    v''  = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
    seps = V.fromList [Paragraphs 1, Sentences 3, Chars 3]

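-- Sketch of the pipeline above, for illustration only (the bindings are
-- hypothetical, not part of the module): every row is re-split by
-- @Paragraphs 1@, then @Sentences 3@, then @Chars 3@, and the results
-- are numbered from 1 in 'd_docId'.
--
-- > let rows = V.fromList [row1, row2]   -- row1, row2 :: CsvDoc
-- > in  map d_docId (toDocs rows)        -- [1, 2, ...] (more if rows were split)
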
---------------------------------------------------------------
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
  where
    fromDocs' (Doc _ t s py pm pd abst auth) = CsvDoc t s py pm pd abst auth

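-- Note: 'fromDocs' simply drops the 'd_docId' field, so converting back
-- with 'toDocs' renumbers the documents (and may re-split them).
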
---------------------------------------------------------------
-- | Split a document into contexts.
--   Only abstracts longer than 1000 characters are candidates for splitting.
--   TODO: adapt the paragraph size to the corpus average.

splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc =
  let docSize = length (csv_abstract doc) in
  if docSize > 1000
     then if mod (round m) docSize >= 10
             then splitDoc' splt doc
             else V.fromList [doc]
     else V.fromList [doc]


splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
  where
    firstDoc      = CsvDoc t s py pm pd firstAbstract auth
    firstAbstract = head' abstracts

    nextDocs = map (\txt -> CsvDoc (head' $ sentences txt)
                                   s py pm pd
                                   (unsentences $ tail' $ sentences txt)
                                   auth
                   ) (tail' abstracts)

    abstracts = splitBy contextSize abst
    head' x   = maybe ""   identity (head x)
    tail' x   = maybe [""] identity (tailMay x)

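-- Illustration (a sketch, not executable as written): with
-- @contextSize = Paragraphs 1@ and an abstract made of paragraphs
-- p1, p2, p3, 'splitDoc'' yields three 'CsvDoc's: the first keeps the
-- original title and gets p1 as abstract; each following one takes the
-- first sentence of its paragraph as title and the remaining sentences
-- as abstract.
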
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double

-- | Mean length, in characters, of the abstracts of a corpus.
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls
  where
    ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc


---------------------------------------------------------------
-- | One row of a Gargantext corpus file.
data CsvDoc = CsvDoc
  { csv_title             :: !Text
  , csv_source            :: !Text
  , csv_publication_year  :: !Int
  , csv_publication_month :: !Int
  , csv_publication_day   :: !Int
  , csv_abstract          :: !Text
  , csv_authors           :: !Text
  }
  deriving (Show)

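-- An illustrative value (the contents are made up):
--
-- > CsvDoc "Some title" "Some journal" 2017 6 1 "Some abstract." "Some authors"
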
instance FromNamedRecord CsvDoc where
  parseNamedRecord r = CsvDoc <$> r .: "title"
                              <*> r .: "source"
                              <*> r .: "publication_year"
                              <*> r .: "publication_month"
                              <*> r .: "publication_day"
                              <*> r .: "abstract"
                              <*> r .: "authors"

instance ToNamedRecord CsvDoc where
  toNamedRecord (CsvDoc t s py pm pd abst aut) =
    namedRecord [ "title"             .= t
                , "source"            .= s
                , "publication_year"  .= py
                , "publication_month" .= pm
                , "publication_day"   .= pd
                , "abstract"          .= abst
                , "authors"           .= aut
                ]


csvDecodeOptions :: DecodeOptions
csvDecodeOptions = defaultDecodeOptions
                     { decDelimiter = fromIntegral $ ord '\t' }

csvEncodeOptions :: EncodeOptions
csvEncodeOptions = defaultEncodeOptions
                     { encDelimiter = fromIntegral $ ord '\t' }


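-- Note: despite the CSV name, corpus files are tab-separated. Decoding
-- is done by column name ('decodeByNameWith'), so the header only has to
-- contain the names used in the 'FromNamedRecord' instance above, e.g.
-- (illustrative, tabs shown as \t):
--
-- > title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors
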
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Read a corpus file and, for each row, join the given field
--   projections with a single space.
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList
                  <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
                  <$> snd
                  <$> readCsv fp

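-- Usage sketch (the file path is illustrative only):
--
-- > readCsvOn [csv_title, csv_abstract] "corpus.csv"
--
-- returns one 'Text' per row, containing its title and abstract joined
-- by a single space.
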
------------------------------------------------------------------------
-- | Read a corpus file; 'panic's if the file cannot be decoded.
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
  csvData <- BL.readFile fp
  case decodeByNameWith csvDecodeOptions csvData of
    Left  e       -> panic (pack e)
    Right csvDocs -> pure csvDocs


writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $
                      encodeByNameWith csvEncodeOptions h (V.toList vs)

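-- Minimal read/write round-trip, as a sketch (the paths and the name
-- @copyCorpus@ are illustrative only; assumes a tab-separated input file
-- with the expected header):
--
-- > copyCorpus :: IO ()
-- > copyCorpus = do
-- >   (header, docs) <- readCsv "in.csv"
-- >   writeCsv "out.csv" (header, docs)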