]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/FrameWrite.hs
Merge remote-tracking branch 'origin/551-dev-graphql-contexts-ngrams' into dev-merge
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / FrameWrite.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.FrameWrite
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 module Gargantext.Core.Text.Corpus.Parsers.FrameWrite
13 where
14
15 import Control.Applicative ((*>))
16 import Control.Monad (void)
17 import Data.Either
18 import Data.Maybe
19 import Data.Text hiding (foldl)
20 import Gargantext.Core.Text (sentences)
21 import Gargantext.Prelude
22 import Prelude ((++), read)
23 import Text.Parsec hiding (Line)
24 import Text.Parsec.String
25 import qualified Data.Text as DT
26 import qualified Data.List as List
27
28
29 -- https://gitlab.iscpif.fr/gargantext/purescript-gargantext/issues/331
30
31 -- Authors : default : anonymous ; except if the following line is encountered
32 -- ^authors: FirstName1, LastName1 ; FirstName2, LastName2 ; etc.
33 -- date : default : date of last change except if the following line is encountered ^@@date: 2021-09-10
34 -- source: Name of the root node except if the following line is encountered ^@@source:
35 -- By default, 1 framawrite node = 1 document. Option for further developments: allow to give a level at generation for the split within framawrite node : :
36 --
37 -- par défaut: un doc == 1 NodeWrite
38 -- ## mean each ## section will be a new document with title the subsubsection title. Either it features options for author, date etc. or it will inherit the document's option.
39
40 sample :: Text
41 sample =
42 unlines
43 [ "title1"
44 -- , "title2"
45 -- , "=="
46 -- , "^@@authors: FirstName1, LastName1; FirstName2, LastName2"
47 , "date: 2021-09-10"
48 , "source: someSource"
49 , "document contents 1"
50 , "document contents 2"
51 ]
52
53 sampleUnordered :: Text
54 sampleUnordered =
55 unlines
56 [ "title1"
57 , "title2"
58 , "=="
59 , "document contents 1"
60 , "date: 2021-09-10"
61 , "authors: FirstName1, LastName1; FirstName2, LastName2"
62 , "source: someSource"
63 , "document contents 2"
64 ]
65
66 -- parseSample = parse documentP "sample" (unpack sample)
67 -- parseSampleUnordered = parse documentP "sampleUnordered" (unpack sampleUnordered)
68 parseLinesSample :: Either ParseError Parsed
69 parseLinesSample = parseLines sample
70 parseLinesSampleUnordered :: Either ParseError Parsed
71 parseLinesSampleUnordered = parseLines sampleUnordered
72
73 data Author =
74 Author { firstName :: Text
75 , lastName :: Text }
76 deriving (Show)
77
78 data Parsed =
79 Parsed { title :: Text
80 , authors :: [Author]
81 , date :: Maybe Date
82 , source :: Maybe Text
83 , contents :: Text }
84 deriving (Show)
85
86 emptyParsed :: Parsed
87 emptyParsed =
88 Parsed { title = ""
89 , authors = []
90 , date = Nothing
91 , source = Nothing
92 , contents = "" }
93
94 data Date =
95 Date { year :: Integer
96 , month :: Integer
97 , day :: Integer
98 }
99 deriving (Show)
100
101 data Line =
102 LAuthors [Author]
103 | LContents Text
104 | LDate Date
105 | LSource Text
106 | LTitle Text
107 deriving (Show)
108
109 parseLines :: Text -> Either ParseError Parsed
110 parseLines text = foldl f emptyParsed <$> lst
111 where
112 lst = parse documentLines "" (unpack text)
113 f (Parsed { .. }) (LAuthors as) = Parsed { authors = as , .. }
114 f (Parsed { .. }) (LContents c) = Parsed { contents = DT.unlines [contents, c], .. }
115 f (Parsed { .. }) (LDate d ) = Parsed { date = Just d , .. }
116 f (Parsed { .. }) (LSource s ) = Parsed { source = Just s , .. }
117 f (Parsed { .. }) (LTitle t ) = Parsed { title = t , .. }
118
119 -- Source should be the name of the node
120 -- First line of each Context should be the title.
121 documentLinesP :: Parser [Line]
122 documentLinesP = do
123 t <- titleP
124 ls <- lineP `sepBy` newline
125 pure $ [LTitle $ pack t] ++ ls
126
127 documentLines :: Parser [Line]
128 documentLines = do
129 ls <- lineP `sepBy` newline
130 pure ls
131
132 lineP :: Parser Line
133 lineP = do
134 choice [ try authorsLineP
135 , try dateLineP
136 , try sourceLineP
137 , contentsLineP
138 ]
139
140 authorsLineP :: Parser Line
141 authorsLineP = do
142 authors <- authorsP
143 pure $ LAuthors authors
144
145 dateLineP :: Parser Line
146 dateLineP = do
147 date <- dateP
148 pure $ LDate date
149
150 sourceLineP :: Parser Line
151 sourceLineP = do
152 source <- sourceP
153 pure $ LSource $ pack source
154
155 contentsLineP :: Parser Line
156 contentsLineP = do
157 contents <- many (noneOf "\n")
158 pure $ LContents $ pack contents
159
160 --------------------
161
162 -- documentP = do
163 -- t <- titleP
164 -- a <- optionMaybe authorsP
165 -- d <- optionMaybe dateP
166 -- s <- optionMaybe sourceP
167 -- c <- contentsP
168 -- pure $ Parsed { title = pack t
169 -- , authors = fromMaybe [] a
170 -- , date = pack <$> d
171 -- , source = pack <$> s
172 -- , contents = pack c }
173
174 titleDelimiterP :: Parser ()
175 titleDelimiterP = do
176 _ <- newline
177 -- _ <- try (string "==")
178 pure ()
179
180 titleP :: Parser [Char]
181 titleP = manyTill anyChar (try titleDelimiterP)
182
183 authorsPrefixP :: Parser [Char]
184 authorsPrefixP = do
185 _ <- string "authors:"
186 many (char ' ')
187 authorsP :: Parser [Author]
188 authorsP = try authorsPrefixP *> sepBy authorP (char ';')
189 authorP :: Parser Author
190 authorP = do
191 fn <- manyTill anyChar (char ',')
192 _ <- many (char ' ')
193 --ln <- manyTill anyChar (void (char ';') <|> tokenEnd)
194 --ln <- manyTill anyChar (tokenEnd)
195 ln <- many (noneOf "\n")
196 pure $ Author { firstName = pack fn, lastName = pack ln }
197 -- manyTill anyChar (void (char '\n') <|> eof)
198
199 datePrefixP :: Parser [Char]
200 datePrefixP = do
201 _ <- string "date:"
202 many (char ' ')
203 dateP :: Parser Date
204 dateP = try datePrefixP
205 *> dateISOP
206 -- *> many (noneOf "\n")
207
208 dateISOP :: Parser Date
209 dateISOP = do
210 year <- rd <$> number
211 _ <- char '-'
212 month <- rd <$> number
213 _ <- char '-'
214 day <- rd <$> number
215 _ <- many (noneOf "\n" )
216 pure $ Date { year, month, day }
217 where
218 rd = read :: [Char] -> Integer
219 number = many1 digit
220
221 sourceP :: Parser [Char]
222 sourceP = try sourcePrefixP
223 *> many (noneOf "\n")
224 where
225 sourcePrefixP :: Parser [Char]
226 sourcePrefixP = do
227 _ <- string "source:"
228 many (char ' ')
229
230 -- contentsP :: Parser String
231 -- contentsP = many anyChar
232
233 tokenEnd :: Parser ()
234 tokenEnd = void (char '\n') <|> eof
235
236 --- MISC Tools
237 -- Using ChunkAlong here enable redundancies in short corpora of texts
238 -- maybe use splitEvery or chunkAlong depending on the size of the whole text
239 text2titleParagraphs :: Int -> Text -> [(Text, Text)]
240 text2titleParagraphs n = catMaybes
241 . List.map doTitle
242 . (chunkAlong n' n)
243 -- . (splitEvery n)
244 . sentences
245 . DT.intercalate " " -- ". "
246 . List.filter (/= "")
247 . DT.lines
248 where
249 n' = n + (round $ (fromIntegral n) / (2 :: Double))
250
251 doTitle :: [Text] -> Maybe (Text, Text)
252 doTitle (t:ts) = Just (t, DT.intercalate " " ts)
253 doTitle [] = Nothing
254
255
256 clean :: Text -> Text
257 clean = DT.unwords . List.filter (\w -> DT.length w < 25) . DT.words
258