summaryrefslogtreecommitdiff
blob: d6fecb25d88dafffd38edd5058866322e80c0e73 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
<?php
/**
 * Generate sitemap files in base XML as well as some namespace extensions.
 *
 * This module generates two different base sitemaps.
 *
 * 1. sitemap.xml
 *    The basic sitemap is updated regularly by wp-cron. It is stored in the
 *    database and retrieved when requested. This sitemap aims to include canonical
 *    URLs for all published content and abide by the sitemap spec. This is the root
 *    of a tree of sitemap and sitemap index xml files, depending on the number of URLs.
 *
 *    By default the sitemap contains published posts of type 'post' and 'page', as
 *    well as the home url. To include other post types use the 'jetpack_sitemap_post_types'
 *    filter.
 *
 * @link http://sitemaps.org/protocol.php Base sitemaps protocol.
 * @link https://support.google.com/webmasters/answer/178636 Image sitemap extension.
 * @link https://developers.google.com/webmasters/videosearch/sitemaps Video sitemap extension.
 *
 * 2. news-sitemap.xml
 *    The news sitemap is generated when requested and then cached; the cache is
 *    flushed whenever a post is published. It does not aim for completeness,
 *    instead including at most 1000 of the most recent published posts
 *    from the previous 2 days, per the news-sitemap spec.
 *
 * @link http://www.google.com/support/webmasters/bin/answer.py?answer=74288 News sitemap extension.
 *
 * @package Jetpack
 * @since 3.9.0
 * @since 4.8.0 Remove 1000 post limit.
 * @author Automattic
 */

require_once dirname( __FILE__ ) . '/sitemap-constants.php';
require_once dirname( __FILE__ ) . '/sitemap-buffer.php';
require_once dirname( __FILE__ ) . '/sitemap-stylist.php';
require_once dirname( __FILE__ ) . '/sitemap-librarian.php';
require_once dirname( __FILE__ ) . '/sitemap-finder.php';
require_once dirname( __FILE__ ) . '/sitemap-builder.php';

if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) {
	require_once dirname( __FILE__ ) . '/sitemap-logger.php';
}

/**
 * Coordinates the generation, storage, and serving of sitemaps.
 *
 * Instantiating this class registers every hook the sitemaps module needs:
 * request interception on `init`, the cron task that rebuilds the sitemaps,
 * the robots.txt entries, news sitemap cache flushing, and data purging.
 *
 * @since 4.8.0
 */
class Jetpack_Sitemap_Manager {

	/**
	 * Librarian used to fetch and delete stored sitemap data.
	 *
	 * @see Jetpack_Sitemap_Librarian
	 * @since 4.8.0
	 * @var Jetpack_Sitemap_Librarian $librarian
	 */
	private $librarian;

	/**
	 * Logger for debug messages. Only assigned when WP_DEBUG is strictly true.
	 *
	 * @see Jetpack_Sitemap_Logger
	 * @since 4.8.0
	 * @var Jetpack_Sitemap_Logger $logger
	 */
	private $logger;

	/**
	 * Finder used to recognize and construct sitemap URIs.
	 *
	 * @see Jetpack_Sitemap_Finder
	 * @since 4.8.0
	 * @var Jetpack_Sitemap_Finder $finder
	 */
	private $finder;

	/**
	 * Build the manager and register all of its hooks.
	 *
	 * @access public
	 * @since 4.8.0
	 */
	public function __construct() {
		$this->librarian = new Jetpack_Sitemap_Librarian();
		$this->finder    = new Jetpack_Sitemap_Finder();

		$debugging = defined( 'WP_DEBUG' ) && ( true === WP_DEBUG );
		if ( $debugging ) {
			$this->logger = new Jetpack_Sitemap_Logger();
		}

		// Sitemap URL handler; it is hooked at a later priority when IS_WPCOM is set.
		$catch_priority = 10;
		if ( defined( 'IS_WPCOM' ) && IS_WPCOM ) {
			$catch_priority = 100;
		}
		add_action( 'init', array( $this, 'callback_action_catch_sitemap_urls' ), $catch_priority );

		// Add generator to wp_cron task list.
		$this->schedule_sitemap_generation();

		/*
		 * Remaining hooks, as ( hook name, method, priority ) triples:
		 *
		 * - do_robotstxt: advertise the sitemaps in robots.txt.
		 * - publish_post: the news sitemap is cached, so flush that cache
		 *   whenever a post is published.
		 * - jetpack_sitemaps_purge_data: wipe all stored sitemap data on demand.
		 * - init (999): module parameters are stored as options in the database
		 *   so sitemap data can be served without processing all of init; this
		 *   late callback evaluates the location filter and stores the result.
		 */
		$hooks = array(
			array( 'do_robotstxt', 'callback_action_do_robotstxt', 20 ),
			array( 'publish_post', 'callback_action_flush_news_sitemap_cache', 10 ),
			array( 'jetpack_sitemaps_purge_data', 'callback_action_purge_data', 10 ),
			array( 'init', 'callback_action_filter_sitemap_location', 999 ),
		);

		foreach ( $hooks as $hook ) {
			add_action( $hook[0], array( $this, $hook[1] ), $hook[2] );
		}
	}

	/**
	 * Send a raw string to the client with the given content type, then stop.
	 *
	 * An empty string is treated as "no sitemap available" and answered with
	 * a 404 through wp_die().
	 *
	 * @access private
	 * @since 4.8.0
	 *
	 * @param string $the_content_type The content type to be served.
	 * @param string $the_content The string to be echoed.
	 */
	private function serve_raw_and_die( $the_content_type, $the_content ) {
		global $wp_query;

		header( 'Content-Type: ' . $the_content_type . '; charset=UTF-8' );

		// Flag the main query as a 'sitemap' feed before any output happens.
		$wp_query->is_feed = true;
		set_query_var( 'feed', 'sitemap' );

		if ( '' === $the_content ) {
			$message = esc_html__( "No sitemap found. Maybe it's being generated. Please try again later.", 'jetpack' );
			$title   = esc_html__( 'Sitemaps', 'jetpack' );

			wp_die( $message, $title, array( 'response' => 404 ) );
		}

		echo $the_content;

		exit;
	}

	/**
	 * Callback that intercepts requests for sitemap URLs and serves the
	 * matching sitemap or stylesheet.
	 *
	 * Returns without output when the request is not for a known sitemap file.
	 *
	 * @access public
	 * @since 4.8.0
	 */
	public function callback_action_catch_sitemap_urls() {
		// The raw path(+query) of the requested URI.
		$raw_uri = '';
		if ( isset( $_SERVER['REQUEST_URI'] ) ) { // WPCS: Input var okay.
			$raw_uri = sanitize_text_field(
				wp_unslash( $_SERVER['REQUEST_URI'] ) // WPCS: Input var okay.
			);
		}

		$request = $this->finder->recognize_sitemap_uri( $raw_uri );

		if ( ! isset( $request['sitemap_name'] ) ) {
			return;
		}

		$requested_name = $request['sitemap_name'];

		/**
		 * Filter the content type used to serve the sitemap XML files.
		 *
		 * @module sitemaps
		 *
		 * @since 3.9.0
		 *
		 * @param string $xml_content_type By default, it's 'text/xml'.
		 */
		$xml_content_type = apply_filters( 'jetpack_sitemap_content_type', 'text/xml' );

		// Routing table: route key => pattern the requested file name must match.
		$routes = array(
			'master'        => '/^sitemap\.xml$/',
			'sitemap'       => '/^sitemap-[1-9][0-9]*\.xml$/',
			'index'         => '/^sitemap-index-[1-9][0-9]*\.xml$/',
			'sitemap-style' => '/^sitemap\.xsl$/',
			'index-style'   => '/^sitemap-index\.xsl$/',
			'image'         => '/^image-sitemap-[1-9][0-9]*\.xml$/',
			'image-index'   => '/^image-sitemap-index-[1-9][0-9]*\.xml$/',
			'image-style'   => '/^image-sitemap\.xsl$/',
			'video'         => '/^video-sitemap-[1-9][0-9]*\.xml$/',
			'video-index'   => '/^video-sitemap-index-[1-9][0-9]*\.xml$/',
			'video-style'   => '/^video-sitemap\.xsl$/',
			'news'          => '/^news-sitemap\.xml$/',
			'news-style'    => '/^news-sitemap\.xsl$/',
		);

		// Find the first route whose pattern matches the requested name.
		$route = null;
		foreach ( $routes as $candidate => $pattern ) {
			if ( preg_match( $pattern, $requested_name ) ) {
				$route = $candidate;
				break;
			}
		}

		// URL did not match any sitemap patterns.
		if ( null === $route ) {
			return;
		}

		// XML routes use the filtered content type; stylesheets override it below.
		$content_type = $xml_content_type;

		switch ( $route ) {
			case 'master':
				// The master sitemap is stored under a fixed, generated file name.
				$body = $this->librarian->get_sitemap_text(
					jp_sitemap_filename( JP_MASTER_SITEMAP_TYPE, 0 ),
					JP_MASTER_SITEMAP_TYPE
				);
				break;

			case 'sitemap':
				$body = $this->librarian->get_sitemap_text( $requested_name, JP_PAGE_SITEMAP_TYPE );
				break;

			case 'index':
				$body = $this->librarian->get_sitemap_text( $requested_name, JP_PAGE_SITEMAP_INDEX_TYPE );
				break;

			case 'sitemap-style':
				$content_type = 'application/xml';
				$body         = Jetpack_Sitemap_Stylist::sitemap_xsl();
				break;

			case 'index-style':
				$content_type = 'application/xml';
				$body         = Jetpack_Sitemap_Stylist::sitemap_index_xsl();
				break;

			case 'image':
				$body = $this->librarian->get_sitemap_text( $requested_name, JP_IMAGE_SITEMAP_TYPE );
				break;

			case 'image-index':
				$body = $this->librarian->get_sitemap_text( $requested_name, JP_IMAGE_SITEMAP_INDEX_TYPE );
				break;

			case 'image-style':
				$content_type = 'application/xml';
				$body         = Jetpack_Sitemap_Stylist::image_sitemap_xsl();
				break;

			case 'video':
				$body = $this->librarian->get_sitemap_text( $requested_name, JP_VIDEO_SITEMAP_TYPE );
				break;

			case 'video-index':
				$body = $this->librarian->get_sitemap_text( $requested_name, JP_VIDEO_SITEMAP_INDEX_TYPE );
				break;

			case 'video-style':
				$content_type = 'application/xml';
				$body         = Jetpack_Sitemap_Stylist::video_sitemap_xsl();
				break;

			case 'news':
				// The news sitemap comes from the builder rather than the librarian.
				$news_builder = new Jetpack_Sitemap_Builder();
				$body         = $news_builder->news_sitemap_xml();
				break;

			case 'news-style':
				$content_type = 'application/xml';
				$body         = Jetpack_Sitemap_Stylist::news_sitemap_xsl();
				break;
		}

		$this->serve_raw_and_die( $content_type, $body );
	}

	/**
	 * Callback that registers the 'sitemap-interval' cron schedule.
	 *
	 * @access public
	 * @since 4.8.0
	 *
	 * @param array $schedules The array of WP_Cron schedules.
	 *
	 * @return array The updated array of WP_Cron schedules.
	 */
	public function callback_add_sitemap_schedule( $schedules ) {
		$sitemap_schedule = array(
			'interval' => JP_SITEMAP_INTERVAL,
			'display'  => __( 'Sitemap Interval', 'jetpack' ),
		);

		$schedules['sitemap-interval'] = $sitemap_schedule;

		return $schedules;
	}

	/**
	 * Register the cron schedule and task that regenerate the sitemap.
	 * Should only be called once, in the constructor.
	 *
	 * @access private
	 * @since 4.8.0
	 */
	private function schedule_sitemap_generation() {
		$cron_hook = 'jp_sitemap_cron_hook';

		// Make the custom interval known to WP_Cron.
		add_filter( 'cron_schedules', array( $this, 'callback_add_sitemap_schedule' ) );

		// The builder's update_sitemap() runs whenever the cron hook fires.
		$builder = new Jetpack_Sitemap_Builder();
		add_action( $cron_hook, array( $builder, 'update_sitemap' ) );

		// Nothing more to do when an event is already queued.
		if ( wp_next_scheduled( $cron_hook ) ) {
			return;
		}

		wp_schedule_event( time(), 'sitemap-interval', $cron_hook );
	}

	/**
	 * Callback that lists the sitemaps in robots.txt.
	 *
	 * @access public
	 * @since 4.8.0
	 */
	public function callback_action_do_robotstxt() {

		/**
		 * Filter whether to make the default sitemap discoverable to robots or not. Default true.
		 *
		 * @module sitemaps
		 * @since 3.9.0
		 *
		 * @param bool $discover_sitemap Make default sitemap discoverable to robots.
		 */
		$discover_sitemap = apply_filters( 'jetpack_sitemap_generate', true );

		if ( true === $discover_sitemap ) {
			$this->echo_robots_sitemap_line( 'sitemap.xml' );
		}

		/**
		 * Filter whether to make the news sitemap discoverable to robots or not. Default true.
		 *
		 * @module sitemaps
		 * @since 3.9.0
		 *
		 * @param bool $discover_news_sitemap Make default news sitemap discoverable to robots.
		 */
		$discover_news_sitemap = apply_filters( 'jetpack_news_sitemap_generate', true );

		if ( true === $discover_news_sitemap ) {
			$this->echo_robots_sitemap_line( 'news-sitemap.xml' );
		}
	}

	/**
	 * Print a single robots.txt "Sitemap:" line for the given sitemap file.
	 *
	 * @access private
	 *
	 * @param string $filename Name of the sitemap file to advertise.
	 */
	private function echo_robots_sitemap_line( $filename ) {
		$url = $this->finder->construct_sitemap_url( $filename );

		echo 'Sitemap: ' . esc_url( $url ) . "\n";
	}

	/**
	 * Callback that drops the cached news sitemap.
	 *
	 * @access public
	 * @since 4.8.0
	 */
	public function callback_action_flush_news_sitemap_cache() {
		delete_transient( 'jetpack_news_sitemap_xml' );
	}

	/**
	 * Callback that resets stored sitemap data: flushes the news sitemap
	 * cache and asks the librarian to delete everything it has stored.
	 *
	 * @access public
	 * @since 5.3.0
	 */
	public function callback_action_purge_data() {
		$this->callback_action_flush_news_sitemap_cache();
		$this->librarian->delete_all_stored_sitemap_data();
	}

	/**
	 * Callback that evaluates the sitemap location filter and stores the
	 * result in the 'jetpack_sitemap_location' option.
	 *
	 * @access public
	 * @since 4.8.0
	 */
	public function callback_action_filter_sitemap_location() {
		/**
		 * Additional path for sitemap URIs. Default value is empty.
		 *
		 * This string is any additional path fragment you want included between
		 * the home URL and the sitemap filenames. Exactly how this fragment is
		 * interpreted depends on your permalink settings. For example:
		 *
		 *   Pretty permalinks:
		 *     home_url() . jetpack_sitemap_location . '/sitemap.xml'
		 *
		 *   Plain ("ugly") permalinks:
		 *     home_url() . jetpack_sitemap_location . '/?jetpack-sitemap=sitemap.xml'
		 *
		 *   PATHINFO permalinks:
		 *     home_url() . '/index.php' . jetpack_sitemap_location . '/sitemap.xml'
		 *
		 * where 'sitemap.xml' is the name of a specific sitemap file.
		 * The value of this filter must be a valid path fragment per RFC 3986;
		 * in particular it must either be empty or begin with a '/'.
		 * Also take care that any restrictions on sitemap location imposed by
		 * the sitemap protocol are satisfied.
		 *
		 * The result of this filter is stored in an option, 'jetpack_sitemap_location';
		 * that option is what gets read when the sitemap location is needed.
		 * This way we don't have to wait for init to finish before building sitemaps.
		 *
		 * @link https://tools.ietf.org/html/rfc3986#section-3.3 RFC 3986
		 * @link http://www.sitemaps.org/ The sitemap protocol
		 *
		 * @since 4.8.0
		 */
		$location = apply_filters( 'jetpack_sitemap_location', '' );

		update_option( 'jetpack_sitemap_location', $location );
	}

} // End Jetpack_Sitemap_Manager class.

// Instantiate the manager when this file loads; its constructor registers the
// sitemap hooks and the cron schedule, so the instance itself is not kept.
new Jetpack_Sitemap_Manager();

/**
 * Build the absolute URL of one of the current blog's sitemap files.
 *
 * The stored 'jetpack_sitemap_location' option is inserted between the home
 * URL and the file name, and the URL shape follows the permalink structure:
 * PATHINFO ('/index.php/...'), pretty, or plain ('?jetpack-sitemap=...').
 *
 * @module sitemaps
 *
 * @since  3.9.0
 * @since  4.8.1 Code uses method found in Jetpack_Sitemap_Finder::construct_sitemap_url in 4.8.0.
 *                It has been moved here to avoid fatal errors with other plugins that were expecting to find this function.
 *
 * @param string $filename Sitemap file name. Defaults to 'sitemap.xml', the initial sitemaps page.
 *
 * @return string Sitemap URL.
 */
function jetpack_sitemap_uri( $filename = 'sitemap.xml' ) {
	global $wp_rewrite;

	$stored_location = Jetpack_Options::get_option_and_ensure_autoload( 'jetpack_sitemap_location', '' );

	// Work out the path relative to the home URL for the active permalink mode.
	if ( $wp_rewrite->using_index_permalinks() ) {
		$relative_path = '/index.php' . $stored_location . '/' . $filename;
	} elseif ( $wp_rewrite->using_permalinks() ) {
		$relative_path = $stored_location . '/' . $filename;
	} else {
		$relative_path = $stored_location . '/?jetpack-sitemap=' . $filename;
	}

	$sitemap_url = home_url( $relative_path );

	/**
	 * Filter sitemap URL relative to home URL.
	 *
	 * Note that this hook name is also applied with an empty path fragment in
	 * Jetpack_Sitemap_Manager::callback_action_filter_sitemap_location(), so
	 * callbacks attached to it see both kinds of value.
	 *
	 * @module sitemaps
	 *
	 * @since 3.9.0
	 *
	 * @param string $sitemap_url Sitemap URL.
	 */
	return apply_filters( 'jetpack_sitemap_location', $sitemap_url );
}