diff --git a/inc/Sitemapper.php b/inc/Sitemapper.php new file mode 100644 index 0000000000000000000000000000000000000000..52c71c545ad5fb975634581af33bd11d9a2379b4 --- /dev/null +++ b/inc/Sitemapper.php @@ -0,0 +1,201 @@ +<?php +/** + * Sitemap handling functions + * + * @license GPL 2 (http://www.gnu.org/licenses/gpl.html) + * @author Michael Hamann <michael@content-space.de> + */ + +if(!defined('DOKU_INC')) die('meh.'); + +/** + * A class for building sitemaps and pinging search engines with the sitemap URL. + * + * @author Michael Hamann + */ +class Sitemapper { + /** + * Builds a Google Sitemap of all public pages known to the indexer + * + * The map is placed in the cache directory named sitemap.xml.gz - This + * file needs to be writable! + * + * @author Michael Hamann + * @author Andreas Gohr + * @link https://www.google.com/webmasters/sitemaps/docs/en/about.html + * @link http://www.sitemaps.org/ + */ + public function generate(){ + global $conf; + if($conf['sitemap'] < 1 || !is_numeric($conf['sitemap'])) return false; + + $sitemap = Sitemapper::getFilePath(); + + if(@file_exists($sitemap)){ + if(!is_writable($sitemap)) return false; + }else{ + if(!is_writable(dirname($sitemap))) return false; + } + + if(@filesize($sitemap) && + @filemtime($sitemap) > (time()-($conf['sitemap']*86400))){ // 60*60*24=86400 + dbglog('Sitemapper::generate(): Sitemap up to date'); // FIXME: only in debug mode + return false; + } + + dbglog("Sitemapper::generate(): using $sitemap"); // FIXME: Only in debug mode + + $pages = idx_getIndex('page', ''); + dbglog('Sitemapper::generate(): creating sitemap using '.count($pages).' pages'); + $items = array(); + + // build the sitemap items + foreach($pages as $id){ + //skip hidden, non existing and restricted files + if(isHiddenPage($id)) continue; + if(auth_aclcheck($id,'','') < AUTH_READ) continue; + $items[] = SitemapItem::createFromID($id); + } + + $eventData = array('items' => &$items, 'sitemap' => &$sitemap); + $event = new Doku_Event('SITEMAP_GENERATE', $eventData); + if ($event->advise_before(true)) { + //save the new sitemap + $result = io_saveFile($sitemap, Sitemapper::getXML($items)); + } + $event->advise_after(); + + return $result; + } + + /** + * Builds the sitemap XML string from the given array auf SitemapItems. + * + * @param $items array The SitemapItems that shall be included in the sitemap. + * @return string The sitemap XML. + * @author Michael Hamann + */ + private function getXML($items) { + ob_start(); + echo '<?xml version="1.0" encoding="UTF-8"?>'.NL; + echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'.NL; + foreach ($items as $item) { + echo $item->toXML(); + } + echo '</urlset>'.NL; + $result = ob_get_contents(); + ob_end_clean(); + return $result; + } + + /** + * Helper function for getting the path to the sitemap file. + * + * @return The path to the sitemap file. + * @author Michael Hamann + */ + public function getFilePath() { + global $conf; + + $sitemap = $conf['cachedir'].'/sitemap.xml'; + if($conf['compression'] === 'bz2' || $conf['compression'] === 'gz'){ + $sitemap .= '.gz'; + } + + return $sitemap; + } + + /** + * Pings search engines with the sitemap url. Plugins can add or remove + * urls to ping using the SITEMAP_PING event. + * + * @author Michael Hamann + */ + public function pingSearchEngines() { + //ping search engines... + $http = new DokuHTTPClient(); + $http->timeout = 8; + + $encoded_sitemap_url = urlencode(wl('', array('do' => 'sitemap'), true, '&')); + $ping_urls = array( + 'google' => 'http://www.google.com/webmasters/sitemaps/ping?sitemap='.$encoded_sitemap_url, + 'yahoo' => 'http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=dokuwiki&url='.$encoded_sitemap_url, + 'microsoft' => 'http://www.bing.com/webmaster/ping.aspx?siteMap='.$encoded_sitemap_url, + ); + + $data = array('ping_urls' => $ping_urls, + 'encoded_sitemap_url' => $encoded_sitemap_url + ); + $event = new Doku_Event('SITEMAP_PING', $data); + if ($event->advise_before(true)) { + foreach ($data['ping_urls'] as $name => $url) { + dbglog("Sitemapper::PingSearchEngines(): pinging $name"); + $resp = $http->get($url); + if($http->error) dbglog("Sitemapper:pingSearchengines(): $http->error"); + dbglog('Sitemapper:pingSearchengines(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp))); + } + } + $event->advise_after(); + + return true; + } +} + +/** + * An item of a sitemap. + * + * @author Michael Hamann + */ +class SitemapItem { + public $url; + public $lastmod; + public $changefreq; + public $priority; + + /** + * Create a new item. + * + * @param $url string The url of the item + * @param $lastmod int Timestamp of the last modification + * @param $changefreq string How frequently the item is likely to change. Valid values: always, hourly, daily, weekly, monthly, yearly, never. + * @param $priority float|string The priority of the item relative to other URLs on your site. Valid values range from 0.0 to 1.0. + */ + public function __construct($url, $lastmod, $changefreq = null, $priority = null) { + $this->url = $url; + $this->lastmod = $lastmod; + $this->changefreq = $changefreq; + $this->priority = $priority; + } + + /** + * Helper function for creating an item for a wikipage id. + * + * @param $id string A wikipage id. + * @param $changefreq string How frequently the item is likely to change. Valid values: always, hourly, daily, weekly, monthly, yearly, never. + * @param $priority float|string The priority of the item relative to other URLs on your site. Valid values range from 0.0 to 1.0. + * @return The sitemap item. + */ + public static function createFromID($id, $changefreq = null, $priority = null) { + $id = trim($id); + $date = @filemtime(wikiFN($id)); + if(!$date) return NULL; + return new SitemapItem(wl($id, '', true), $date, $changefreq, $priority); + } + + /** + * Get the XML representation of the sitemap item. + * + * @return The XML representation. + */ + public function toXML() { + $result = ' <url>'.NL + .' <loc>'.hsc($this->url).'</loc>'.NL + .' <lastmod>'.date_iso8601($this->lastmod).'</lastmod>'.NL; + if ($this->changefreq !== NULL) + $result .= ' <changefreq>'.hsc($this->changefreq).'</changefreq>'.NL; + if ($this->priority !== NULL) + $result .= ' <priority>'.hsc($this->priority).'</priority>'.NL; + $result .= ' </url>'.NL; + return $result; + } +} diff --git a/inc/actions.php b/inc/actions.php index 0a6e6d8c7fa96912df2948d4a1b4dfbbee47b56a..9db7d5f24e18883b4c4dea893fe8a03411477455 100644 --- a/inc/actions.php +++ b/inc/actions.php @@ -53,6 +53,11 @@ function act_dispatch(){ //check permissions $ACT = act_permcheck($ACT); + //sitemap + if ($ACT == 'sitemap'){ + $ACT = act_sitemap($ACT); + } + //register if($ACT == 'register' && $_POST['save'] && register()){ $ACT = 'login'; @@ -205,7 +210,7 @@ function act_clean($act){ 'preview','search','show','check','index','revisions', 'diff','recent','backlink','admin','subscribe','revert', 'unsubscribe','profile','resendpwd','recover', - 'draftdel','subscribens','unsubscribens',)) && substr($act,0,7) != 'export_' ) { + 'draftdel','subscribens','unsubscribens','sitemap')) && substr($act,0,7) != 'export_' ) { msg('Command unknown: '.htmlspecialchars($act),-1); return 'show'; } @@ -233,7 +238,8 @@ function act_permcheck($act){ }else{ $permneed = AUTH_CREATE; } - }elseif(in_array($act,array('login','search','recent','profile','index'))){ + }elseif(in_array($act,array('login','search','recent','profile','index', 'sitemap'))){ + }elseif(in_array($act,array('login','search','recent','profile','sitemap'))){ $permneed = AUTH_NONE; }elseif($act == 'revert'){ $permneed = AUTH_ADMIN; @@ -586,6 +592,51 @@ function act_export($act){ return 'show'; } +/** + * Handle sitemap delivery + * + * @author Michael Hamann <michael@content-space.de> + */ +function act_sitemap($act) { + global $conf; + + if ($conf['sitemap'] < 1 || !is_numeric($conf['sitemap'])) { + header("HTTP/1.0 404 Not Found"); + print "Sitemap generation is disabled."; + exit; + } + + $sitemap = Sitemapper::getFilePath(); + if(strrchr($sitemap, '.') === '.gz'){ + $mime = 'application/x-gzip'; + }else{ + $mime = 'application/xml; charset=utf-8'; + } + + // Check if sitemap file exists, otherwise create it + if (!is_readable($sitemap)) { + Sitemapper::generate(); + } + + if (is_readable($sitemap)) { + // Send headers + header('Content-Type: '.$mime); + + http_conditionalRequest(filemtime($sitemap)); + + // Send file + //use x-sendfile header to pass the delivery to compatible webservers + if (http_sendfile($sitemap)) exit; + + readfile($sitemap); + exit; + } + + header("HTTP/1.0 500 Internal Server Error"); + print "Could not read the sitemap file - bad permissions?"; + exit; +} + /** * Handle page 'subscribe' * diff --git a/inc/common.php b/inc/common.php index 3e760419f27fbd501c9abcb2c7e9fc2b2e19726b..894b8a020b3af30bbf61ea48ca1796e9a99ff0fa 100644 --- a/inc/common.php +++ b/inc/common.php @@ -1270,6 +1270,21 @@ function dformat($dt=null,$format=''){ return strftime($format,$dt); } +/** + * Formats a timestamp as ISO 8601 date + * + * @author <ungu at terong dot com> + * @link http://www.php.net/manual/en/function.date.php#54072 + */ +function date_iso8601($int_date) { + //$int_date: current date in UNIX timestamp + $date_mod = date('Y-m-d\TH:i:s', $int_date); + $pre_timezone = date('O', $int_date); + $time_zone = substr($pre_timezone, 0, 3).":".substr($pre_timezone, 3, 2); + $date_mod .= $time_zone; + return $date_mod; +} + /** * return an obfuscated email address in line with $conf['mailguard'] setting * diff --git a/inc/load.php b/inc/load.php index 2f5be6d63c76b4007fae859411261a2dac5ec144..478ee7c761209dc1574aaf5a16c375837f0881c2 100644 --- a/inc/load.php +++ b/inc/load.php @@ -74,6 +74,7 @@ function load_autoload($name){ 'DokuWikiFeedCreator' => DOKU_INC.'inc/feedcreator.class.php', 'Doku_Parser_Mode' => DOKU_INC.'inc/parser/parser.php', 'SafeFN' => DOKU_INC.'inc/SafeFN.class.php', + 'Sitemapper' => DOKU_INC.'inc/Sitemapper.php', 'DokuWiki_Action_Plugin' => DOKU_PLUGIN.'action.php', 'DokuWiki_Admin_Plugin' => DOKU_PLUGIN.'admin.php', diff --git a/lib/exe/indexer.php b/lib/exe/indexer.php index 3a9673ed6aaabd9fd5fe39d52c76b07c52dbe949..f35f9ed7294921b3a23ae6709f9e38b961b6e57d 100644 --- a/lib/exe/indexer.php +++ b/lib/exe/indexer.php @@ -229,88 +229,10 @@ function metaUpdate(){ * @link https://www.google.com/webmasters/sitemaps/docs/en/about.html */ function runSitemapper(){ - global $conf; print "runSitemapper(): started".NL; - if(!$conf['sitemap']) return false; - - if($conf['compression'] == 'bz2' || $conf['compression'] == 'gz'){ - $sitemap = 'sitemap.xml.gz'; - }else{ - $sitemap = 'sitemap.xml'; - } - print "runSitemapper(): using $sitemap".NL; - - if(@file_exists(DOKU_INC.$sitemap)){ - if(!is_writable(DOKU_INC.$sitemap)) return false; - }else{ - if(!is_writable(DOKU_INC)) return false; - } - - if(@filesize(DOKU_INC.$sitemap) && - @filemtime(DOKU_INC.$sitemap) > (time()-($conf['sitemap']*60*60*24))){ - print 'runSitemapper(): Sitemap up to date'.NL; - return false; - } - - $pages = idx_getIndex('page', ''); - print 'runSitemapper(): creating sitemap using '.count($pages).' pages'.NL; - - // build the sitemap - ob_start(); - print '<?xml version="1.0" encoding="UTF-8"?>'.NL; - print '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'.NL; - foreach($pages as $id){ - $id = trim($id); - $file = wikiFN($id); - - //skip hidden, non existing and restricted files - if(isHiddenPage($id)) continue; - $date = @filemtime($file); - if(!$date) continue; - if(auth_aclcheck($id,'','') < AUTH_READ) continue; - - print ' <url>'.NL; - print ' <loc>'.wl($id,'',true).'</loc>'.NL; - print ' <lastmod>'.date_iso8601($date).'</lastmod>'.NL; - print ' </url>'.NL; - } - print '</urlset>'.NL; - $data = ob_get_contents(); - ob_end_clean(); - - //save the new sitemap - io_saveFile(DOKU_INC.$sitemap,$data); - - //ping search engines... - $http = new DokuHTTPClient(); - $http->timeout = 8; - - //ping google - print 'runSitemapper(): pinging google'.NL; - $url = 'http://www.google.com/webmasters/sitemaps/ping?sitemap='; - $url .= urlencode(DOKU_URL.$sitemap); - $resp = $http->get($url); - if($http->error) print 'runSitemapper(): '.$http->error.NL; - print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL; - - //ping yahoo - print 'runSitemapper(): pinging yahoo'.NL; - $url = 'http://search.yahooapis.com/SiteExplorerService/V1/updateNotification?appid=dokuwiki&url='; - $url .= urlencode(DOKU_URL.$sitemap); - $resp = $http->get($url); - if($http->error) print 'runSitemapper(): '.$http->error.NL; - print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL; - - //ping microsoft - print 'runSitemapper(): pinging microsoft'.NL; - $url = 'http://www.bing.com/webmaster/ping.aspx?siteMap='; - $url .= urlencode(DOKU_URL.$sitemap); - $resp = $http->get($url); - if($http->error) print 'runSitemapper(): '.$http->error.NL; - print 'runSitemapper(): '.preg_replace('/[\n\r]/',' ',strip_tags($resp)).NL; - + $result = Sitemapper::generate() && Sitemapper::pingSearchEngines(); print 'runSitemapper(): finished'.NL; - return true; + return $result; } /** @@ -405,21 +327,6 @@ function sendDigest() { $_SERVER['REMOTE_USER'] = $olduser; } -/** - * Formats a timestamp as ISO 8601 date - * - * @author <ungu at terong dot com> - * @link http://www.php.net/manual/en/function.date.php#54072 - */ -function date_iso8601($int_date) { - //$int_date: current date in UNIX timestamp - $date_mod = date('Y-m-d\TH:i:s', $int_date); - $pre_timezone = date('O', $int_date); - $time_zone = substr($pre_timezone, 0, 3).":".substr($pre_timezone, 3, 2); - $date_mod .= $time_zone; - return $date_mod; -} - /** * Just send a 1x1 pixel blank gif to the browser *