]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[MERGE] distances for Graph.
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.String (IsString)
29 import Data.Text (Text, pack, unpack, length)
30 import qualified Data.ByteString.Lazy as BL
31
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
34 import Safe (tailMay)
35 import Text.HTML.TagSoup
36
37 import Gargantext.Text
38 import Gargantext.Prelude hiding (length)
39
40 ---------------------------------------------------------------
41 data Doc = Doc
42 { d_docId :: !Int
43 , d_title :: !Text
44 , d_source :: !Text
45 , d_publication_year :: !Int
46 , d_publication_month :: !Int
47 , d_publication_day :: !Int
48 , d_abstract :: !Text
49 , d_authors :: !Text
50 }
51 deriving (Show)
52 ---------------------------------------------------------------
53 toDocs :: Vector CsvDoc -> [Doc]
54 toDocs v = V.toList
55 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
56 -> Doc nId t s py pm pd abst auth )
57 (V.enumFromN 1 (V.length v'')) v''
58 where
59 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
60 seps= (V.fromList [Paragraph, Sentences, Chars])
61
62 ---------------------------------------------------------------
63 fromDocs :: Vector Doc -> Vector CsvDoc
64 fromDocs docs = V.map fromDocs' docs
65 where
66 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
67
68 ---------------------------------------------------------------
69 -- | Split a document in its context
70 -- TODO adapt the size of the paragraph according to the corpus average
71
72 data SplitBy = Paragraph | Sentences | Chars
73
74 splitDoc :: Mean -> SplitBy -> CsvDoc -> Vector CsvDoc
75 splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
76 if docSize > 1000
77 then
78 if (mod (round m) docSize) >= 10
79 then
80 splitDoc' splt doc
81 else
82 V.fromList [doc]
83 else
84 V.fromList [doc]
85
86
87 splitDoc' :: SplitBy -> CsvDoc -> Vector CsvDoc
88 splitDoc' splt (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
89 where
90 firstDoc = CsvDoc t s py pm pd firstAbstract auth
91 firstAbstract = head' abstracts
92
93 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
94
95 abstracts = (splitBy splt) abst
96 head' x = maybe "" identity (head x)
97 tail' x = maybe [""] identity (tailMay x)
98
99
100 splitBy :: SplitBy -> Text -> [Text]
101 splitBy Chars = map pack . chunkAlong 1000 1 . unpack
102 splitBy Sentences = map unsentences . chunkAlong 20 1 . sentences
103 splitBy Paragraph = map removeTag . filter isTagText . parseTags
104 where
105 removeTag :: IsString p => Tag p -> p
106 removeTag (TagText x) = x
107 removeTag (TagComment x) = x
108 removeTag _ = ""
109
110 ---------------------------------------------------------------
111 ---------------------------------------------------------------
112 type Mean = Double
113
114 docsSize :: Vector CsvDoc -> Mean
115 docsSize csvDoc = mean ls
116 where
117 ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
118
119
120 ---------------------------------------------------------------
121 data CsvDoc = CsvDoc
122 { c_title :: !Text
123 , c_source :: !Text
124 , c_publication_year :: !Int
125 , c_publication_month :: !Int
126 , c_publication_day :: !Int
127 , c_abstract :: !Text
128 , c_authors :: !Text
129 }
130 deriving (Show)
131
132 instance FromNamedRecord CsvDoc where
133 parseNamedRecord r = CsvDoc <$> r .: "title"
134 <*> r .: "source"
135 <*> r .: "publication_year"
136 <*> r .: "publication_month"
137 <*> r .: "publication_day"
138 <*> r .: "abstract"
139 <*> r .: "authors"
140
141 instance ToNamedRecord CsvDoc where
142 toNamedRecord (CsvDoc t s py pm pd abst aut) =
143 namedRecord [ "title" .= t
144 , "source" .= s
145 , "publication_year" .= py
146 , "publication_month" .= pm
147 , "publication_day" .= pd
148 , "abstract" .= abst
149 , "authors" .= aut
150 ]
151
152
153 csvDecodeOptions :: DecodeOptions
154 csvDecodeOptions = (defaultDecodeOptions
155 {decDelimiter = fromIntegral $ ord '\t'}
156 )
157
158 csvEncodeOptions :: EncodeOptions
159 csvEncodeOptions = ( defaultEncodeOptions
160 {encDelimiter = fromIntegral $ ord '\t'}
161 )
162
163
164 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
165 readCsv fp = do
166 csvData <- BL.readFile fp
167 case decodeByNameWith csvDecodeOptions csvData of
168 Left e -> panic (pack e)
169 Right csvDocs -> pure csvDocs
170
171
172 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
173 writeCsv fp (h, vs) = BL.writeFile fp $
174 encodeByNameWith csvEncodeOptions h (V.toList vs)
175