364 lines
No EOL
9.3 KiB
PHP
364 lines
No EOL
9.3 KiB
PHP
<?php
|
||
/**
|
||
* Fonctions utiles au plugin Traitement corpus web
|
||
*
|
||
* @plugin Traitement corpus web
|
||
* @copyright 2021
|
||
* @author gamuza
|
||
* @licence GNU/GPL
|
||
* @package SPIP\Corpus\Fonctions
|
||
*/
|
||
|
||
// Stop loading this file unless the _ECRIRE_INC_VERSION constant is defined
// (presumably set by the SPIP bootstrap, i.e. no direct access — confirm).
if (defined('_ECRIRE_INC_VERSION') === false) {
	return;
}
|
||
|
||
/**
 * Fill in the title of a corpus video from its description when the title is empty.
 *
 * Reads `title` and `description` from the `spip_corpus_videos` row whose
 * `id_corpus_video` matches. When the title is empty but the description is
 * not, the description shortened with couper(…, 100) is stored as the title.
 *
 * @param int|string $id_video Identifier of the video; must hold an integer value.
 * @return string|false The description shortened with couper(…, 100),
 *                      an empty string when no row matches,
 *                      or false when $id_video is not an integer value.
 */
function cree_titre($id_video) {
	// is_numeric() rejects non-numeric strings on every PHP version
	// (before PHP 8, 'abc' != 0 was false and let such values through).
	if (!is_numeric($id_video) || $id_video != intval($id_video)) {
		return false;
	}
	// Only the integer form is ever concatenated into the SQL conditions.
	$id_video = intval($id_video);

	$res = sql_allfetsel('title,description', 'spip_corpus_videos', 'id_corpus_video=' . $id_video);
	if (!is_array($res) || !isset($res[0])) {
		// Unknown video: nothing to read and nothing to update.
		return '';
	}

	$titre = (string) ($res[0]['title'] ?? '');
	$description = (string) ($res[0]['description'] ?? '');

	if ($titre === '' && $description !== '') {
		$set = ['title' => couper($description, 100)];
		sql_updateq('spip_corpus_videos', $set, 'id_corpus_video=' . $id_video);
	}

	return couper($description, 100);
}
|
||
|
||
/**
 * Store the distinct words of an XML transcription in a corpus video row.
 *
 * The file is looked up under _DIR_TMP, in the directory configured as
 * `corpus_web/repertoire_transcriptions`. The text captured after each
 * `<Word …>` tag is de-duplicated, stripped of the configured stop words
 * (`corpus_web/stop_words`, comma separated) and joined with commas, then
 * written to `transcription_detailed_content`. A placeholder message is
 * stored instead when the file is missing/unreadable or yields nothing.
 *
 * @param string     $fichier  File name, relative to the transcription directory.
 * @param int|string $id_video Identifier of the video; must hold an integer value.
 * @return mixed An error message when $id_video is invalid, the value of
 *               sql_error() when the update fails, otherwise the stored
 *               content as read back from the database.
 */
function integre_xml($fichier, $id_video) {
	// Validate before touching the configuration, the file system or the database.
	if (!is_numeric($id_video) || $id_video != intval($id_video)) {
		return 'erreur : absence d\'id_video ou format incorrect ('.$id_video.')';
	}
	// Only the integer form is ever concatenated into the SQL conditions.
	$id_video = intval($id_video);

	$config_stop_words = lire_config('corpus_web/stop_words');
	$stop_words = explode(',', is_string($config_stop_words) ? $config_stop_words : '');
	$chemin = _DIR_TMP.lire_config('corpus_web/repertoire_transcriptions').'/'.$fichier;

	// file_get_contents() returns false when the file cannot be read.
	$brut = file_exists($chemin) ? file_get_contents($chemin) : false;
	if ($brut === false) {
		$content = 'fichier XML absent';
	} else {
		preg_match_all('/<Word [^>]*> ([^<]*) /i', $brut, $matches, PREG_PATTERN_ORDER);
		$mots = array_diff(array_unique($matches[1]), $stop_words);
		$content = join(',', $mots);
	}
	if ($content == '') {
		$content = 'fichier XML vide';
	}

	$set = ['transcription_detailed_content' => $content];
	sql_updateq('spip_corpus_videos', $set, 'id_corpus_video='.$id_video);
	$erreur = sql_error();
	if ($erreur) {
		return $erreur;
	}

	return sql_getfetsel('transcription_detailed_content', 'spip_corpus_videos', 'id_corpus_video='.$id_video);
}
|
||
|
||
/**
 * Store the readable text of a transcription file in a corpus video row.
 *
 * The file is looked up under _DIR_TMP, in the directory configured as
 * `corpus_web/repertoire_transcriptions`. Its content is passed through
 * nettoyer_timecodes() and written to `transcription_readable_content`.
 * A placeholder message is stored instead when the file is missing/unreadable
 * or empty.
 *
 * @param string     $fichier  File name, relative to the transcription directory.
 * @param int|string $id_video Identifier of the video; must hold an integer value.
 * @return mixed An error message when $id_video is invalid, the value of
 *               sql_error() when the update fails, otherwise the stored
 *               content as read back from the database.
 */
function integre_txt($fichier, $id_video) {
	// Validate before touching the configuration, the file system or the database.
	if (!is_numeric($id_video) || $id_video != intval($id_video)) {
		return 'erreur : absence d\'id_video ou format incorrect ('.$id_video.')';
	}
	// Only the integer form is ever concatenated into the SQL conditions.
	$id_video = intval($id_video);

	$chemin = _DIR_TMP.lire_config('corpus_web/repertoire_transcriptions').'/'.$fichier;

	// file_get_contents() returns false when the file cannot be read.
	$brut = file_exists($chemin) ? file_get_contents($chemin) : false;
	if ($brut === false) {
		$content = 'fichier de transcription absent';
	} else {
		$content = nettoyer_timecodes($brut);
	}
	if ($content == '') {
		$content = 'fichier de transcription vide';
	}

	$set = ['transcription_readable_content' => $content];
	sql_updateq('spip_corpus_videos', $set, 'id_corpus_video='.$id_video);
	$erreur = sql_error();
	if ($erreur) {
		return $erreur;
	}

	return sql_getfetsel('transcription_readable_content', 'spip_corpus_videos', 'id_corpus_video='.$id_video);
}
|
||
|
||
/**
 * Remove time-code markers from a transcription and unwrap its lines.
 *
 * Each `[hh:mm:ss - hh:mm:ss] ` marker is replaced by a single space.
 * Pairs of line feeds (paragraph breaks) are kept, every remaining single
 * line feed is removed.
 *
 * @param string $content Raw transcription text.
 * @return string Cleaned text.
 */
function nettoyer_timecodes($content) {
	$content = (string) preg_replace('/\[\d{2}:\d{2}:\d{2} - \d{2}:\d{2}:\d{2}\] /', ' ', $content);

	// strtr() tries the longest key first at each position, so "\n\n" is kept
	// as is while a lone "\n" is dropped. No placeholder character is needed,
	// hence a literal "§" in the text is no longer turned into a paragraph break.
	return strtr($content, ["\n\n" => "\n\n", "\n" => '']);
}
|
||
|
||
/**
 * Replace simple markup tags by a space.
 *
 * A tag is `<`, any run of letters, digits, `/`, `=`, quotes, spaces, `?`,
 * `:` or `.`, then `>`; tags containing other characters are left untouched.
 *
 * @param string $content Text containing tags.
 * @return string|null Result of preg_replace().
 */
function nettoyer_balises($content) {
	$motif_balise = '/<[a-zA-Z0-9\/=\'" ?:.]*>/i';

	return preg_replace($motif_balise, " ", $content);
}
|
||
|
||
/**
 * Count the lines of a file by reading it line by line.
 *
 * @param string $fichier Path of the file.
 * @return int|false Number of lines read (0 when the file cannot be opened),
 *                   or false when the file does not exist.
 */
function nombre_ligne($fichier) {
	if (!file_exists($fichier)) {
		return false;
	}

	$num_ligne = 0;
	$handle = @fopen($fichier, 'rb');
	if ($handle) {
		while (fgets($handle) !== false) {
			$num_ligne++;
		}
		// Release the handle: it used to stay open until the end of the request.
		fclose($handle);
	}

	return $num_ligne;
}
|
||
|
||
/**
 * Return one line of a file, identified by its 1-based number.
 *
 * @param string $fichier Path of the file.
 * @param int    $num     Number of the wanted line, starting at 1.
 * @return string|false The line as returned by fgets() (line ending included),
 *                      an error message when the file has fewer than $num
 *                      lines or cannot be opened, or false when the file
 *                      does not exist.
 */
function affiche_ligne($fichier, $num) {
	if (!file_exists($fichier)) {
		return false;
	}

	$handle = @fopen($fichier, 'rb');
	if ($handle) {
		try {
			$num_ligne = 0;
			while (($buffer = fgets($handle)) !== false) {
				$num_ligne++;
				if ($num_ligne == $num) {
					return $buffer;
				}
			}
			if (!feof($handle)) {
				echo "Erreur: fgets() a échoué\n";
			}
		} finally {
			// Runs on the early return too, so the handle never leaks.
			fclose($handle);
		}
	}

	return 'Erreur : nombre de lignes du fichier '.$fichier.' inférieur à '.$num;
}
|
||
|
||
// TODO: fix the extraction of heading contents (see the brezhoweb.com site)
|
||
/**
 * Extract the headings (h1 to h6) of an HTML document.
 *
 * Headings are returned grouped by level (all h1 first, then h2, …), each
 * one re-wrapped in a bare `<hN>…</hN>` tag without its attributes.
 *
 * @param string $html HTML source.
 * @return string Headings separated by CRLF, or an empty string when none is found.
 */
function affiche_titraille($html) {
	$titraille = [];

	foreach (range(1, 6) as $h) {
		// Lazy capture and `[^>]*` for the attributes: the former greedy `(.*)`
		// swallowed everything between the first opening and the last closing
		// tag of a level. `\b` keeps `<h1` from matching a longer tag name.
		preg_match_all('/<h'.$h.'\b[^>]*>(.*?)<\/h'.$h.'>/si', $html, $match, PREG_SET_ORDER);
		foreach ($match as $m) {
			$titraille[] = '<h'.$h.'>'.$m[1].'</h'.$h.'>';
		}
	}

	return join("\r\n", $titraille);
}
|
||
|
||
/**
 * Extract the content of the `<title>` elements of an HTML document.
 *
 * @param string $html HTML source.
 * @return string Titles separated by CRLF, or an empty string when none is found.
 */
function affiche_title($html) {
	// Always start from an array: join() on an undefined variable is a
	// TypeError on PHP 8 when the document has no <title>.
	$titles = [];

	// Lazy capture so that several <title> elements are not merged into one.
	preg_match_all('/<title\b[^>]*>(.*?)<\/title>/si', $html, $match, PREG_SET_ORDER);
	foreach ($match as $m) {
		$titles[] = $m[1];
	}

	return join("\r\n", $titles);
}
|
||
|
||
/**
 * Format a size in bytes with a one-letter unit (B, K, M, G, T, P).
 *
 * The unit is chosen from the number of digits of the integer part (one step
 * every three digits) while the value is divided by powers of 1024, so sizes
 * from 1000 to 1023 are reported as 0.98K to 1.00K.
 *
 * @param int|float|string $bytes    Size in bytes.
 * @param int              $decimals Number of decimals to print.
 * @return string Formatted size, e.g. "1.50K".
 */
function human_filesize($bytes, $decimals = 2) {
	$sz = 'BKMGTP';
	$valeur = (float) $bytes;

	// Count the digits of the integer part only: the sign and the fractional
	// part used to be counted by strlen() and skewed the unit.
	$chiffres = strlen(number_format(floor(abs($valeur)), 0, '', ''));

	// Clamp to the last known unit rather than silently printing none.
	$factor = (int) min(floor(($chiffres - 1) / 3), strlen($sz) - 1);

	return sprintf("%.{$decimals}f", $valeur / pow(1024, $factor)) . $sz[$factor];
}
|
||
|
||
/**
 * List the texts found right after elements whose class attribute ends with
 * `titre`, `field-content` or `nodeTitle nobreak`.
 *
 * `<em>` tags are removed first; one optional tag directly after the element
 * is skipped and the text up to the next `<` or `>` is kept when not empty.
 *
 * @param string $html HTML source.
 * @return string An HTML `<ul>` list of the texts, or an empty string when none is found.
 */
function recup_titres($html) {
	$sans_em = str_replace(['<em>','</em>'], '', $html);

	// Default PREG_PATTERN_ORDER: $trouves[3] holds every captured text.
	preg_match_all('/class="(titre|field-content|nodeTitle nobreak)">(<[^>]*>)?([^<>]*)/si', $sans_em, $trouves);

	$titres = array_filter($trouves[3], static function ($texte) {
		return $texte !== '';
	});

	if (count($titres) === 0) {
		return '';
	}

	return "<ul><li>".join("</li><li>", $titres)."</li></ul>";
}
|
||
|
||
/**
 * Return the part of a file name located before its first underscore.
 *
 * @param string $nom_fichier_jsonl File name.
 * @return string Leading segment, or the whole name when it has no underscore.
 */
function recup_site($nom_fichier_jsonl) {
	[$site] = explode('_', $nom_fichier_jsonl, 2);

	return $site;
}
|
||
|
||
/**
 * Extract the first three dash-separated parts of the second
 * underscore-separated segment of a file name.
 *
 * For `site_2021-03-04-120000.jsonl` this gives `2021-03-04`
 * (presumably the crawl date — confirm against the naming scheme).
 *
 * @param string $nom_fichier_jsonl File name.
 * @return string The three parts joined by dashes, or an empty string when the
 *                name has no underscore or fewer than three dash-separated parts
 *                (such names used to raise undefined-offset warnings).
 */
function recup_aspi($nom_fichier_jsonl) {
	$segments = explode('_', (string) $nom_fichier_jsonl);
	if (!isset($segments[1])) {
		return '';
	}

	$parties = explode('-', $segments[1]);
	if (count($parties) < 3) {
		return '';
	}

	return implode('-', array_slice($parties, 0, 3));
}
|
||
|
||
/**
 * Cut a text at the first "Poster un nouveau commentaire" (case-insensitive):
 * that phrase and everything after it are removed.
 *
 * @param string $boiler_text Extracted page text.
 * @return string|null Result of preg_replace().
 */
function supprimer_com($boiler_text) {
	$motif_commentaires = '/Poster un nouveau commentaire.*/si';

	return preg_replace($motif_commentaires, '', $boiler_text);
}
|
||
|
||
/**
 * List the tracking services referenced in an HTML document.
 *
 * Looks for `googleanalytics` / `google-analytics`, `xiti.com`,
 * `ranktrackr.net` and `seoposition.com` (case-insensitive). Both Google
 * spellings are reported as "google analytics", the others in lower case;
 * each service is listed once, in order of first appearance.
 *
 * @param string $html HTML source.
 * @return string An HTML `<ul>` list, or an empty string when nothing is found.
 */
function trouve_trackers($html) {
	preg_match_all('/google(-?)analytics|xiti\.com|ranktrackr\.net|seoposition\.com/si', $html, $trouves);

	// Array keys give de-duplication while keeping first-seen order.
	$vus = [];
	foreach ($trouves[0] as $brut) {
		$nom = strtolower($brut);
		if (str_replace(['-', ' '], '', $nom) === 'googleanalytics') {
			$nom = 'google analytics';
		}
		$vus[$nom] = true;
	}

	$trackers = array_keys($vus);
	if (!$trackers) {
		return '';
	}

	return "<ul><li>".join("</li><li>", $trackers)."</li></ul>";
}
|
||
|
||
/**
 * List the social networks referenced in an HTML document.
 *
 * Looks for `facebook.com`, `twitter.com` and `plusone` (case-insensitive)
 * and reports them as "Facebook", "Twitter" and "Google Plus One", each one
 * once, in order of first appearance.
 *
 * @param string $html HTML source.
 * @return string An HTML `<ul>` list, or an empty string when nothing is found.
 */
function trouve_rezos($html) {
	$libelles = [
		'facebook.com' => 'Facebook',
		'twitter.com' => 'Twitter',
		'plusone' => 'Google Plus One',
	];

	preg_match_all('/facebook\.com|twitter\.com|plusone/si', $html, $trouves);

	$rezos = [];
	foreach ($trouves[0] as $brut) {
		$cle = strtolower($brut);
		if (isset($libelles[$cle]) && !in_array($libelles[$cle], $rezos)) {
			$rezos[] = $libelles[$cle];
		}
	}

	if (!$rezos) {
		return '';
	}

	return "<ul><li>".join("</li><li>", $rezos)."</li></ul>";
}
|
||
|
||
/**
 * List the external links (`<a href="http(s)://…">`) of an HTML document.
 *
 * Links to the hosts `www.o2zone.tv` and `t.co` are ignored, as are links
 * whose domain (last two labels of the host) is in the exclusion list.
 * Each remaining target is listed once, without its scheme.
 *
 * @param string $html HTML source.
 * @return string An HTML `<ul>` list of the targets, or an empty string when none is found.
 */
function recup_liens_ext($html) {
	$exclus = [
		'w3.org',
		'adobe.com',
		'xiti.com',
		'ranktrackr.net',
		'seoposition.com',
		'facebook.com',
		'twitter.com',
	];

	// The look-ahead rejects the two hosts only when they are complete (followed
	// by `/`, `:`, `?`, `#`, the closing quote or the end of the input). The
	// former prefix test also rejected hosts such as `t.community.org`, and
	// its unescaped dot in `o2zone.tv` matched any character.
	preg_match_all(
		'/<a href="http(s?):\/\/(?!(?:www\.o2zone\.tv|t\.co)(?:[\/:?#"]|$))([^"]*)/si',
		$html,
		$match,
		PREG_SET_ORDER
	);

	$liens_ext = [];
	foreach ($match as $m) {
		// Host = everything before the first path, port, query or fragment
		// delimiter; lower-cased because host names are case-insensitive.
		$hote = strtolower(preg_split('~[/:?#]~', $m[2], 2)[0]);
		// Keep the last two labels only (www.facebook.com -> facebook.com).
		$domaine = implode('.', array_slice(explode('.', $hote), -2));

		if (in_array($domaine, $exclus, true)) {
			continue;
		}
		if (!in_array($m[2], $liens_ext, true)) {
			$liens_ext[] = $m[2];
		}
	}

	return count($liens_ext) ? "<ul><li>".join("</li><li>", $liens_ext)."</li></ul>" : '';
}
|
||
|
||
/**
 * List the internal links of an HTML document with their link text.
 *
 * A link is internal when its href is a path starting with `/`, optionally
 * prefixed by `http(s)://www.o2zone.tv`. Navigation links are skipped: paths
 * and link texts from the exclusion lists, paths containing `#comment`,
 * `/user/`, `/members/` or `/subscription/`, and links without visible text.
 * Each path is listed once, with the text of its first occurrence
 * (`<img>` and `<span>` tags removed).
 *
 * NOTE(review): the link-text comparison normalizes non-breaking spaces and
 * `&nbsp;` entities to plain spaces and trims the text. The original lists
 * contained invisible whitespace variants whose exact bytes could not be
 * recovered — confirm against real pages.
 *
 * @param string $html HTML source.
 * @return string An HTML `<ul>` list of "path : text" items, or an empty string when none is found.
 */
function recup_liens_int($html) {
	$txt_exclus = [
		'Accueil',
		'Edito',
		'En savoir plus...',
		'Plus de mots-clés',
		'Lire la suite »',
		'Ajouter un commentaire',
		'dernier »',
		'suivant ›',
		'XHTML',
		'répondre',
		'Téléchargez le plus récent lecteur Flash',
		'1',
		'2',
		'3',
		'4',
		'5',
		'6',
		'7',
		'8',
		'9',
		'Partie1',
		'Partie2',
		'Partie3',
		'Partie4',
	];
	$url_exclus = [
		'/',
		'/la-tele',
		'/video',
		'/la-boite-outils',
		'/lassociation',
		'/forum',
		'/user',
		'/aide-du-site-o2zone',
		'/propos',
		'/filter/tips',
		'/rss.xml',
	];
	$fragments_exclus = ['#comment', '/user/', '/members/', '/subscription/'];

	// Groups: 3 = path starting with "/", 7 = link content.
	preg_match_all(
		'/<a href="(http(s?):\/\/www\.o2zone\.tv)?(\/[^"]*)?([^>]*)?">(<(img|p|acronym)[^>]*>)?(.*?)<\/a>/si',
		$html,
		$match,
		PREG_SET_ORDER
	);

	$liens_int = [];
	foreach ($match as $m) {
		$url = $m[3];
		$titre = preg_replace('/<(img|(\/)?span)[^>]*>/si', '', $m[7]);
		$titre_normalise = trim(str_replace(["\u{00A0}", '&nbsp;'], ' ', $titre));

		// An empty path means the href was not an internal path (external
		// link, mailto, …): such links used to be stored under the empty key.
		if ($url === '' || in_array($url, $url_exclus, true)) {
			continue;
		}
		if ($titre_normalise === '' || in_array($titre_normalise, $txt_exclus, true)) {
			continue;
		}
		foreach ($fragments_exclus as $fragment) {
			if (str_contains($url, $fragment)) {
				continue 2;
			}
		}

		if (!array_key_exists($url, $liens_int)) {
			$liens_int[$url] = $titre;
		}
	}

	$l_int = [];
	foreach ($liens_int as $url => $titre) {
		$l_int[] = $url.' : '.$titre;
	}

	return count($l_int) ? "<ul><li>".join("</li><li>", $l_int)."</li></ul>" : '';
}
|
||
|
||
/**
 * Turn an HTML `<ul>` list into plain text, one item per line.
 *
 * `<ul>`, `</ul>` and `<li>` are removed and each `</li>` becomes a CRLF.
 * Non-breaking spaces become plain spaces.
 *
 * NOTE(review): the original search string was a single invisible whitespace
 * character. It is assumed to have been a non-breaking space or an `&nbsp;`
 * entity, so both are handled explicitly here — confirm.
 *
 * @param string $html HTML list.
 * @return string Plain text.
 */
function sans_li($html) {
	$html = str_replace(['<ul>', '</ul>', '<li>'], '', $html);
	$html = str_replace('</li>', "\r\n", $html);

	return str_replace(["\u{00A0}", '&nbsp;'], ' ', $html);
}
|
||
|
||
/**
 * Replace HTML line breaks by CRLF and non-breaking spaces by plain spaces.
 *
 * `<br>`, `<br/>` and `<br />` are recognized in any letter case.
 *
 * NOTE(review): the original search string was a single invisible whitespace
 * character. It is assumed to have been a non-breaking space or an `&nbsp;`
 * entity, so both are handled explicitly here — confirm.
 *
 * @param string $html HTML fragment.
 * @return string Plain text.
 */
function sans_br($html) {
	$html = (string) preg_replace('/<br\s*\/?>/i', "\r\n", $html);

	return str_replace(["\u{00A0}", '&nbsp;'], ' ', $html);
}