scanTheLinksFromVoss.php 5.27 KB
<?php
//$FILE='ooexport.html';
//$_LOSP_CHARSET='utf-8';
//$GLOBALS['nichtgefunden'] = 0;
//
//
//$Result_array = readFormatedFile($FILE);
//print_r($Result_array);

/**
 * Read an XML/XHTML export and collect one entry per <p> element.
 *
 * The document is walked three levels below the root element (the loop
 * variables call these levels body, div and content). Headings h1/h2/h3 seen
 * at the content level are remembered, and every following <p> is passed to
 * getpValue() and tagged with the headings currently in effect. A new h1
 * discards the current h2 and h3; a new h2 discards the current h3.
 *
 * Progress is echoed while parsing (headings, skipped spans, load errors).
 * libxml internal error collection is switched on and left on.
 *
 * @param string $FILE Path of the file handed to simplexml_load_file().
 * @return array List indexed from 0 with one entry per <p>: whatever
 *               getpValue() returned, plus optional 'h1', 'h2', 'h3' keys.
 *               Empty when the file cannot be loaded or parsed.
 */
function readFormatedFile($FILE) {
	libxml_use_internal_errors(true);

	$document = simplexml_load_file($FILE);
	if ($document === false) {
		echo "Laden des XML fehlgeschlagen\n";
		foreach (libxml_get_errors() as $error) {
			echo "\t", $error->message;
		}
		// Drop the reported errors so they are not reported again by a later call.
		libxml_clear_errors();
		// Nothing to walk; calling children() on false would be a fatal error.
		return array();
	}

	// Text of the heading currently in effect at each level; null = none seen yet.
	$currentH1 = $currentH2 = $currentH3 = null;
	$Result_array = array();

	foreach ($document->children() as $value_body) {
		foreach ($value_body as $value_div) {
			foreach ($value_div as $key_content => $value_content) {
				switch ($key_content) {
					case 'h1':
						$currentH1 = cleanString(getHValue($value_content));
						// A new top-level heading invalidates the lower levels.
						$currentH2 = $currentH3 = null;
						echo "\nh1 =\t".$currentH1;
						break;

					case 'h2':
						$currentH2 = cleanString(getHValue($value_content));
						$currentH3 = null;
						echo "\n\th2 =\t".$currentH2;
						break;

					case 'h3':
						$currentH3 = cleanString(getHValue($value_content));
						echo "\n\t\th3 =\t".$currentH3;
						break;

					case 'span':
						echo "\n vernachlässigbar \n";
						break;

					case 'p':
						// getpValue() may return null; assigning a key below
						// then turns the entry into an array holding only headings.
						$entry = getpValue($value_content);
						if ($currentH1 !== null) {
							$entry['h1'] = $currentH1;
						}
						if ($currentH2 !== null) {
							$entry['h2'] = $currentH2;
						}
						if ($currentH3 !== null) {
							$entry['h3'] = $currentH3;
						}
						$Result_array[] = $entry;
						break;
				}
			}
		}
	}
	return $Result_array;
}

/**
 * Tell whether a paragraph result carries both a usable text and a usable link.
 *
 * @param array $param_result Candidate result, expected to hold 'info' and 'link'.
 * @return bool True only when both keys are present and their values are truthy.
 */
function checkpResult(array $param_result) {
	// empty() covers both "key missing" and "value falsy" in one test.
	$hasInfo = !empty($param_result['info']);
	$hasLink = !empty($param_result['link']);

	return $hasInfo && $hasLink;
}
/**
 * Flatten a value to a single-line, trimmed string.
 *
 * Every line feed is replaced by a space, then leading and trailing
 * whitespace is trimmed.
 *
 * @param mixed $param_str String, null, or an object convertible to string
 *                         (callers in this file pass SimpleXMLElement nodes
 *                         and the possibly-null result of getHValue()).
 * @return string The cleaned text; '' for null or empty input.
 */
function cleanString($param_str) {
	// Cast explicitly: handing null to str_replace() is deprecated since PHP 8.1.
	$text = (string) $param_str;

	return trim(str_replace("\n", ' ', $text));
}

/**
 * Extract a description text and a link target from one <p> element.
 *
 * Several markup layouts are tried in order; the first one that yields both a
 * non-empty text and a non-empty href (see checkpResult()) wins:
 *   1. a child <span> that contains an <a>: span text + that anchor's href;
 *   2. a direct child <a>: the paragraph's own text + the href of the first
 *      direct <a>;
 *   3. the first direct <a>'s href combined with the text of <span>;
 *   4. the same href combined with the text of <span><span>;
 *   5. if a link was found but no text, retry with span[0], then span[0]->span.
 *
 * When nothing matches, a diagnostic dump is echoed and the global counter
 * $GLOBALS['nichtgefunden'] is incremented.
 *
 * @param SimpleXMLElement $param_SimpleXMLElement The <p> node to inspect.
 * @return array|null array('info' => string, 'link' => string) on success,
 *                    null when no text/link pair could be determined.
 */
function getpValue(SimpleXMLElement $param_SimpleXMLElement) {
	$result = false;

	foreach ($param_SimpleXMLElement as $key => $value) {
		if ($key === 'span') {
			if ($value->a) {
				$a_attributtes = $value->a->attributes();
				$result = array(
					'info'	=>	cleanString((string) $value),
					'link'	=>	cleanString((string) $a_attributtes['href'])
				);
				if (checkpResult($result)) {
					// Author's note: seen for http://wiki.ic.org/, http://www.coforum.de/, http://www.stiftung-trias.de/
					return $result;
				}
			}
		} else if ($key === 'a') {
			// NOTE(review): this reads the FIRST direct <a>, not the <a> currently
			// iterated ($value); kept as in the original - confirm that is intended.
			$a_attributtes = $param_SimpleXMLElement->a->attributes();
			$loopHref = cleanString((string) $a_attributtes['href']);
			if ($loopHref) {
				$result = array(
					'info'	=>	cleanString((string) $param_SimpleXMLElement),
					'link'	=>	$loopHref
				);
				if (checkpResult($result)) {
					// Author's note: 2 entries without info - http://www.solioeko.de/, http://www.architekturarchiv-web.de/nh.htm
					return $result;
				}
			}
		}
	}

	// href of the first direct <a>, or '' when the paragraph has none.
	// Guarded because attributes() yields null for a missing element and
	// indexing null raises a warning.
	$href = '';
	if (isset($param_SimpleXMLElement->a)) {
		$a_attributtes = $param_SimpleXMLElement->a->attributes();
		$href = cleanString((string) $a_attributtes['href']);
	}

	if ($href) {
		if ($param_SimpleXMLElement->span) {
			$result = array(
				'info'	=>	cleanString((string) $param_SimpleXMLElement->span),
				'link'	=>	$href
			);
			if (checkpResult($result)) {
				return $result;
			}
		}

		if ($param_SimpleXMLElement->span->span) {
			$result = array(
				'info'	=>	cleanString((string) $param_SimpleXMLElement->span->span),
				'link'	=>	$href
			);
			if (checkpResult($result)) {
				return $result;
			}
		}
	}

	// A link is known but the text is still missing: try the first span explicitly.
	if (is_array($result) && $result['link'] && !checkpResult($result)) {
		$firstSpan = $param_SimpleXMLElement->span[0];
		// span[0] is null when there is no <span>; only look inside it when it exists.
		if ($firstSpan) {
			$result['info'] = cleanString((string) $firstSpan);
			if (checkpResult($result)) {
				return $result;
			}

			if ($firstSpan->span) {
				$result['info'] = cleanString((string) $firstSpan->span);
				if (checkpResult($result)) {
					return $result;
				}
			}
		}
	}

	// Nothing usable found: dump diagnostics and count the miss.
	echo ':)';
	if (!isset($GLOBALS['nichtgefunden'])) {
		// The counter is not initialised anywhere in the visible part of this file.
		$GLOBALS['nichtgefunden'] = 0;
	}
	$GLOBALS['nichtgefunden']++;
	echo "\n\n ".$GLOBALS['nichtgefunden']." (".(string)$param_SimpleXMLElement.") \n";
	print_r($result);
	print_r($param_SimpleXMLElement);

	return null;
}
/**
 * Locate the heading text inside a heading element.
 *
 * Children are visited in document order. An <a> child that contains a <span>
 * is resolved through getRecursiveSpanValue(); an <a> without a <span> is
 * skipped; any other child is searched recursively with this function.
 * The first non-null hit is returned. A child that yields nothing no longer
 * ends the search - the remaining siblings are still examined.
 *
 * @param SimpleXMLElement $param_SimpleXMLElement Heading node (or a wrapper inside one).
 * @return SimpleXMLElement|null The span node holding the text, or null if none is found.
 */
function getHValue(SimpleXMLElement $param_SimpleXMLElement){
	foreach ($param_SimpleXMLElement as $key => $value) {
		if ($key == 'a') {
			$found = $value->span ? getRecursiveSpanValue($value) : null;
		} else {
			$found = getHValue($value);
		}

		if ($found !== null) {
			return $found;
		}
	}

	return null;
}
/**
 * Descend through nested <span> children and return the innermost one.
 *
 * Only direct children named "span" are considered, in document order.
 * A span that itself contains a span is followed recursively and that
 * result is returned as-is (even if it is null); otherwise the span node
 * itself is returned when it evaluates to true.
 *
 * @param SimpleXMLElement $param_SimpleXMLElement Node whose span children are searched.
 * @return SimpleXMLElement|null The located span node, or null when there is none.
 */
function getRecursiveSpanValue(SimpleXMLElement $param_SimpleXMLElement){
	foreach ($param_SimpleXMLElement as $childName => $child) {
		if ($childName !== 'span') {
			continue;
		}

		if ($child->span) {
			// Nested span: the text lives further down.
			return getRecursiveSpanValue($child);
		}

		if ($child) {
			return $child;
		}
	}

	return null;
}

?>