From 6fee512f809750baddbf36f857ecbce3c025f2ed Mon Sep 17 00:00:00 2001
From: cy_altern
Date: Mon, 29 May 2023 18:43:34 +0200
Subject: [PATCH] Feat: pas de balises li et br pour la version CSV

---
Review notes (text between "---" and the first "diff --git" line is not part
of the commit):
- Fixed: the "text" entry of json_affiche_corpus.json.html read
  #GET{text}sans_br (no "|" before the filter name, unlike the five sibling
  entries). The pipe is added.
- This copy of the patch appears to have been through an HTML renderer:
  <ul>/<li>/<br> tags and non-breaking spaces inside string literals, the
  sender e-mail address and all indentation were lost. The needles in
  sans_li()/sans_br() are rebuilt from the rendering artefacts, and the <br>
  and non-breaking-space needles are widened to the usual spellings.
  TODO: confirm against upstream commit 6fee512f.
- Still damaged, restore from the upstream file before applying: the
  str_replace([' ', ' '], ...) needles in the first hunk (the first one was
  presumably a non-breaking space), the string returned by recup_liens_int()
  in the second hunk, and the tab/space indentation of every line.
- Doc comments were added to the two new functions; the second hunk header and
  the diffstat were updated accordingly, the "index" hashes were not.

 corpus_web_fonctions.php      | 31 ++++++++++++++++++++++++++++++-
 json_affiche_corpus.json.html | 12 ++++++------
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/corpus_web_fonctions.php b/corpus_web_fonctions.php
index 395ea17..f81d832 100644
--- a/corpus_web_fonctions.php
+++ b/corpus_web_fonctions.php
@@ -330,7 +330,7 @@ function recup_liens_int($html) {
 	foreach($match as $m) {
 		$m[7] = preg_replace('/<(img|(\/)?span)[^>]*>/si', '', $m[7]);
 		// liens exclus
-		if(in_array($m[3], $url_exclus) || str_contains($m[3], '#comment') || str_contains($m[3], '/user/') || in_array($m[7], $txt_exclus) || str_replace([' ', ' '], '', $m[7]) === '') {
+		if(in_array($m[3], $url_exclus) || str_contains($m[3], '#comment') || str_contains($m[3], '/user/') || str_contains($m[3], '/members/') || str_contains($m[3], '/subscription/') || in_array($m[7], $txt_exclus) || str_replace([' ', ' '], '', $m[7]) === '') {
 			continue;
 		}
 		if (!array_key_exists($m[3], $liens_int)) {
@@ -347,3 +347,32 @@ function recup_liens_int($html) {
 
 	return count($l_int) ? "" : '';
 }
+
+/**
+ * Flatten an HTML list into plain text for the CSV export: the list wrappers
+ * are dropped, item boundaries become CRLF, non-breaking spaces become spaces.
+ *
+ * @param string $html HTML fragment shaped like <ul><li>...</li><li>...</li></ul>
+ * @return string the same text with one list item per line
+ */
+function sans_li($html) {
+	$html = str_replace(['</li></ul>', '<ul><li>'], '', $html);
+	$html = str_replace('</li><li>', "\r\n", $html);
+	$html = str_replace(['&nbsp;', "\u{00A0}"], " ", $html);
+
+	return $html;
+}
+
+/**
+ * Turn HTML line breaks into CRLF and non-breaking spaces into plain spaces,
+ * for the CSV export.
+ *
+ * @param string $html HTML fragment containing <br>, <br/> or <br /> tags
+ * @return string the same text with real line breaks
+ */
+function sans_br($html) {
+	$html = str_replace(['<br />', '<br/>', '<br>'], "\r\n", $html);
+	$html = str_replace(['&nbsp;', "\u{00A0}"], " ", $html);
+
+	return $html;
+}
\ No newline at end of file
diff --git a/json_affiche_corpus.json.html b/json_affiche_corpus.json.html
index cf8bee7..147468a 100644
--- a/json_affiche_corpus.json.html
+++ b/json_affiche_corpus.json.html
@@ -94,12 +94,12 @@
 			"title": [(#GET{title}|json_encode)],
 			"desc": [(#GET{description}|json_encode)],
 			"keywords": [(#GET{keywords}|json_encode)],
-			"trackers": [(#GET{trackers}|json_encode)],
-			"rezos": [(#GET{rezos}|json_encode)],
-			"titres": [(#GET{titres}|json_encode)],
-			"liens_ext": [(#GET{liens_ext}|json_encode)],
-			"liens_int": [(#GET{liens_int}|json_encode)],
-			"text": [(#GET{text}|json_encode)]
+			"trackers": [(#GET{trackers}|sans_li|json_encode)],
+			"rezos": [(#GET{rezos}|sans_li|json_encode)],
+			"titres": [(#GET{titres}|sans_li|json_encode)],
+			"liens_ext": [(#GET{liens_ext}|sans_li|json_encode)],
+			"liens_int": [(#GET{liens_int}|sans_li|json_encode)],
+			"text": [(#GET{text}|sans_br|json_encode)]
 		}
 	}
 	[(#SET{id_jsonl, #GET{id_jsonl}|plus{1}})]