+
+ // Tell mbstring to drop characters it cannot convert instead of emitting a
+ // substitute character.
+ mb_substitute_character("none");
+
+ // Most recent libxml error, or false when nothing has been recorded.
+ $error = libxml_get_last_error();
+
+ // libxml error 32 is "unsupported encoding" (presumably a libxml build
+ // without iconv): transcode the payload on the PHP side and parse it again.
+ $unsupported_encoding = $error && $error->code == 32;
+
+ if ($unsupported_encoding) {
+     $data = $this->normalize_encoding($data);
+ }
+
+ // Only re-parse when normalization produced a non-empty payload.
+ if ($unsupported_encoding && $data) {
+     libxml_clear_errors();
+
+     $this->doc = new DOMDocument();
+     $this->doc->loadXML($data);
+
+     // Re-read the error state so the later checks see the retry's outcome.
+     $error = libxml_get_last_error();
+ }
+
+ // Second recovery attempt: libxml error 9 (invalid character) means the
+ // payload contains bytes or code points that are not legal in XML. Clean the
+ // payload and parse it once more.
+ if ($error) {
+     foreach (libxml_get_errors() as $err) {
+         if ($err->code == 9) {
+             // if the source feed is not in utf8, next conversion will fail
+             $data = $this->normalize_encoding($data);
+
+             // remove dangling bytes: a UTF-8 -> UTF-8 round trip lets mbstring
+             // discard malformed sequences
+             $data = mb_convert_encoding($data, 'UTF-8', 'UTF-8');
+
+             // Not every UTF-8 character is valid XML. Keep exactly the XML 1.0
+             // "Char" set: tab, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD and the
+             // supplementary planes U+10000-U+10FFFF (emoji etc.), replacing
+             // everything else with a space.
+             $data = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]+/u', ' ', $data);
+
+             // preg_replace() yields null on failure; skip the re-parse then.
+             if ($data) {
+                 libxml_clear_errors();
+
+                 $this->doc = new DOMDocument();
+                 $this->doc->loadXML($data);
+
+                 $error = libxml_get_last_error();
+             }
+
+             // One clean-up pass is enough, however many code-9 errors exist.
+             break;
+         }
+     }
+ }
+
+ // Parsing still reports an error after the recovery attempts: record every
+ // fatal libxml error in formatted form.
+ if ($error) {
+     // A dedicated loop variable keeps $error (the last libxml error) intact.
+     foreach (libxml_get_errors() as $libxml_error) {
+         if ($libxml_error->level == LIBXML_ERR_FATAL) {
+             // Format once and reuse the result for both destinations.
+             $formatted = $this->format_error($libxml_error);
+
+             // currently only the first error is reported
+             if (!isset($this->error)) {
+                 $this->error = $formatted;
+             }
+
+             $this->libxml_errors[] = $formatted;
+         }
+     }
+ }