diff --git a/content/contenu_jsonl.html b/content/contenu_jsonl.html index 732ad32..91ee9b2 100644 --- a/content/contenu_jsonl.html +++ b/content/contenu_jsonl.html @@ -45,6 +45,23 @@
Titres :
[(#VALEUR|base64_decode|recup_titres)]
+
  • +
    Trackers :
    +
    [(#VALEUR|base64_decode|trouve_trackers)]
    +
  • +
  • +
    Rezos socios :
    +
    [(#VALEUR|base64_decode|trouve_rezos)]
    +
  • +
  • +
    Liens externes :
    +
    [(#VALEUR|base64_decode|recup_liens_ext)]
    +
  • +
  • +
    Liens internes :
    +
    [(#VALEUR|base64_decode|recup_liens_int)]
    +
  • + ] [(#CLE|=={extractionContent}|oui) @@ -52,7 +69,7 @@
  • [(#CLE|replace{htmlmeta:head:, meta }|replace{boilerpipe:, ''}|ucfirst)] :
    - [(#VALEUR|print|replace{
    ,§§}|replace{
    ,§§}|replace{<,<}|replace{>,>}|replace{§§,
    }|replace{"\n",
    })] + [(#VALEUR|print|supprimer_com|replace{
    ,§§}|replace{
    ,§§}|replace{<,<}|replace{>,>}|replace{§§,
    }|replace{"\n",
    })]
  • ] diff --git a/corpus_web_fonctions.php b/corpus_web_fonctions.php index de2b8d3..395ea17 100644 --- a/corpus_web_fonctions.php +++ b/corpus_web_fonctions.php @@ -191,4 +191,159 @@ function recup_aspi($nom_fichier_jsonl) { $T = explode('_', $nom_fichier_jsonl)[1]; $T = explode('-', $T); return $T[0].'-'.$T[1].'-'.$T[2]; -} \ No newline at end of file +} + +function supprimer_com($boiler_text) { + return preg_replace('/Poster un nouveau commentaire.*/si', '', $boiler_text); +} + +function trouve_trackers($html) { + // google(-?)analytics|xiti.com|ranktrackr.net + $trackers = []; + preg_match_all('/google(-?)analytics|xiti\.com|ranktrackr\.net|seoposition\.com/si', $html, $match, PREG_SET_ORDER); + if ($match) { + foreach($match as $m) { + if (strtolower(str_replace(['-', ' '], '', $m[0])) === 'googleanalytics') { + if (!in_array('google analytics', $trackers)) { + $trackers[] = 'google analytics'; + } + } else { + if (!in_array(strtolower($m[0]), $trackers)) { + $trackers[] = strtolower($m[0]); + } + } + } + } + + return count($trackers) ? "" : ''; +} + +function trouve_rezos($html) { + // facebook.com|twitter.com|pinterest.com|plusone + $rezos = []; + preg_match_all('/facebook\.com|twitter\.com|plusone/si', $html, $match, PREG_SET_ORDER); + if ($match) { + foreach($match as $m) { + switch (strtolower($m[0])) { + case 'facebook.com': + if (!in_array('Facebook', $rezos)) { + $rezos[] = 'Facebook'; + } + break; + case 'twitter.com': + if (!in_array('Twitter', $rezos)) { + $rezos[] = 'Twitter'; + } + break; + case 'plusone': + if (!in_array('Google Plus One', $rezos)) { + $rezos[] = 'Google Plus One'; + } + break; + } + } + } + + return count($rezos) ? "" : ''; +} + +function recup_liens_ext($html) { + // 1) { + $Tdomaine = explode('.', $domaine); + $domaine = array_pop($Tdomaine); + $domaine = array_pop($Tdomaine).'.'.$domaine; + } + if(in_array($domaine, $exclus)) { + continue; + } + if (!in_array($m[2], $liens_ext)) { + $liens_ext[] = $m[2]; + } + } + } + + return count($liens_ext) ? "" : ''; +} + +function recup_liens_int($html) { + // ]*)?">(<(img|p|acronym)[^>]*>)?(.*?)<\/a>/si', $html, $match, PREG_SET_ORDER); + if ($match) { + foreach($match as $m) { + $m[7] = preg_replace('/<(img|(\/)?span)[^>]*>/si', '', $m[7]); + // liens exclus + if(in_array($m[3], $url_exclus) || str_contains($m[3], '#comment') || str_contains($m[3], '/user/') || in_array($m[7], $txt_exclus) || str_replace([' ', ' '], '', $m[7]) === '') { + continue; + } + if (!array_key_exists($m[3], $liens_int)) { + $liens_int[$m[3]] = $m[7]; + } + } + } + $l_int = []; + if (count($liens_int)) { + foreach ($liens_int as $url => $titre) { + $l_int[] = $url.' : '.$titre; + } + } + + return count($l_int) ? "" : ''; +} diff --git a/json_affiche_corpus.json.html b/json_affiche_corpus.json.html index 06f97e4..b4560b5 100644 --- a/json_affiche_corpus.json.html +++ b/json_affiche_corpus.json.html @@ -9,20 +9,36 @@ "title": "Meta title", "desc": "Meta description", "keywords": "Meta keywords", + "trackers": "Trackers", + "rezos": "Rézos socios", "titres": "titres", + "liens_ext": "Liens externes", + "liens_int": "Liens internes", "text": "Texte" }, "filtreCol" : { "site" : "select", "date_aspi": "select", - "num_page": "input", + "num_page": "select", "title": "input", "desc": "input", "keywords": "input", + "trackers": "select", + "rezos": "select", "titres": "input", + "liens_ext": "input", + "liens_int": "input", "text": "input" }, "classes":{ + "num_page": "w80p", + "title": "w120p", + "desc": "w120p", + "keywords": "w200p", + "trackers": "w100p", + "rezos": "w100p", + "liens_int": "maxw400p", + "text": "minw400p" } } #SET{id_jsonl,1} @@ -32,17 +48,23 @@ [(#SET{num_ligne,[(#COMPTEUR_BOUCLE|=={1}|?{1,#GET{nb_lignes}})]})] [(#SET{content,[(#FILE|affiche_ligne{#GET{num_ligne}}|json_decode{true})]})] - #SET{num_page,Page #VALEUR} #SET{title,''} #SET{keywords,''} #SET{description,''} #SET{titres,''} + #SET{num_page,Page #VALEUR} #SET{title,''} #SET{keywords,''} #SET{description,''} #SET{titres,''} #SET{trackers,''} #SET{rezos,''} #SET{liens_ext,''} #SET{liens_int,''} [(#CLE|=={extractionContent}|oui) - [(#CLE|=={boilerpipe:text}|oui) [(#SET{text, text: [(#VALEUR|print|replace{
    ,§§}|replace{
    ,§§}|replace{<,<}|replace{>,>}|replace{§§,
    }|replace{"\n",
    })]})] ] - [(#CLE|=={htmlmeta:head:title}|oui) [(#SET{title, title: [(#VALEUR|print|replace{
    ,§§}|replace{
    ,§§}|replace{<,<}|replace{>,>}|replace{§§,
    }|replace{"\n",
    })]})] ] - [(#CLE|=={htmlmeta:head:keywords}|oui) [(#SET{keywords, keywords: [(#VALEUR|print|replace{
    ,§§}|replace{
    ,§§}|replace{<,<}|replace{>,>}|replace{§§,
    }|replace{"\n",
    })]})] ] - [(#CLE|=={htmlmeta:head:description}|oui) [(#SET{description, desc: [(#VALEUR|print|replace{
    ,§§}|replace{
    ,§§}|replace{<,<}|replace{>,>}|replace{§§,
    }|replace{"\n",
    })]})] ] + [(#CLE|=={boilerpipe:text}|oui) + [(#SET{text, [(#VALEUR|print|supprimer_com|replace{
    ,§§}|replace{
    ,§§}|replace{<,<}|replace{>,>}|replace{§§,
    }|replace{"\n",
    })]})] + ] + [(#CLE|=={htmlmeta:head:title}|oui) [(#SET{title, [(#VALEUR|print|replace{
    ,§§}|replace{
    ,§§}|replace{<,<}|replace{>,>}|replace{§§,
    }|replace{"\n",
    })]})] ] + [(#CLE|=={htmlmeta:head:keywords}|oui) [(#SET{keywords, [(#VALEUR|print|replace{
    ,§§}|replace{
    ,§§}|replace{<,<}|replace{>,>}|replace{§§,
    }|replace{"\n",
    }|replace{',', ', '})]})] ] + [(#CLE|=={htmlmeta:head:description}|oui) [(#SET{description, [(#VALEUR|print|replace{
    ,§§}|replace{
    ,§§}|replace{<,<}|replace{>,>}|replace{§§,
    }|replace{"\n",
    })]})] ] ][(#CLE|=={htmlBytes}|oui) [(#SET{titres, [(#VALEUR|base64_decode|recup_titres)]})] + [(#SET{trackers, [(#VALEUR|base64_decode|trouve_trackers)]})] + [(#SET{rezos, [(#VALEUR|base64_decode|trouve_rezos)]})] + [(#SET{liens_ext, [(#VALEUR|base64_decode|recup_liens_ext)]})] + [(#SET{liens_int, [(#VALEUR|base64_decode|recup_liens_int)]})] ] ,{ @@ -54,7 +76,11 @@ "title": [(#GET{title}|json_encode)], "desc": [(#GET{description}|json_encode)], "keywords": [(#GET{keywords}|json_encode)], + "trackers": [(#GET{trackers}|json_encode)], + "rezos": [(#GET{rezos}|json_encode)], "titres": [(#GET{titres}|json_encode)], + "liens_ext": [(#GET{liens_ext}|json_encode)], + "liens_int": [(#GET{liens_int}|json_encode)], "text": [(#GET{text}|json_encode)] }, "classes": { @@ -68,7 +94,11 @@ "title": [(#GET{title}|json_encode)], "desc": [(#GET{description}|json_encode)], "keywords": [(#GET{keywords}|json_encode)], + "trackers": [(#GET{trackers}|json_encode)], + "rezos": [(#GET{rezos}|json_encode)], "titres": [(#GET{titres}|json_encode)], + "liens_ext": [(#GET{liens_ext}|json_encode)], + "liens_int": [(#GET{liens_int}|json_encode)], "text": [(#GET{text}|json_encode)] } } [(#SET{id_jsonl, #GET{id_jsonl}|plus{1}})]