PHP Regular Expressions
Introduction to PHP Regular Expressions
PHP uses the PCRE (Perl Compatible Regular Expressions) library, providing powerful pattern matching capabilities for text processing, validation, and manipulation.
Basic Pattern Matching
preg_match() - Single Match
<?php
// Basic pattern matching: preg_match() stops at the first match
$text = "The year is 2023";
$pattern = '/\d{4}/'; // Match 4 digits
if (preg_match($pattern, $text, $matches)) {
    echo "Found year: " . $matches[0]; // 2023
}

// Email validation: the pattern is anchored, so the whole string must be an address
$email = 'user@example.com';
$emailPattern = '/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/';
if (preg_match($emailPattern, $email)) {
    echo "Valid email address";
} else {
    echo "Invalid email address";
}

// Extract domain from URL: the optional "www." sits in a non-capturing group,
// so capture group 1 holds the host without it
$url = "https://www.example.com/path";
$domainPattern = '/https?:\/\/(?:www\.)?([^\/]+)/';
if (preg_match($domainPattern, $url, $matches)) {
    echo "Domain: " . $matches[1]; // example.com
}
?>
preg_match_all() - Multiple Matches
<?php
// Find all matches: preg_match_all() returns the number of full matches
$text = 'Contact us at support@example.com or sales@example.com';
$emailPattern = '/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/';
$matches = [];
$count = preg_match_all($emailPattern, $text, $matches);
echo "Found $count email(s):\n";
// With the default ordering, $matches[0] lists every full match
foreach ($matches[0] as $email) {
    echo "- $email\n";
}

// Extract all numbers with context
// Single quotes keep the dollar signs literal without relying on interpolation rules
$text = 'Price: $19.99, Discount: 15%, Tax: $2.50';
$pricePattern = '/\$(\d+\.\d{2})/';
// PREG_SET_ORDER groups the result per match: $match[0] is the full text, $match[1] the amount
preg_match_all($pricePattern, $text, $matches, PREG_SET_ORDER);
foreach ($matches as $match) {
    echo "Price found: $" . $match[1] . "\n";
}
?>
Search and Replace
preg_replace() - Replace Patterns
<?php
// Simple replacement: every occurrence of the pattern is substituted
$text = "Hello world, wonderful world!";
$result = preg_replace('/world/', 'universe', $text);
echo $result; // "Hello universe, wonderful universe!"

// Case-insensitive replacement: the trailing "i" modifier ignores letter case
$text = "PHP is great. php rocks!";
$result = preg_replace('/php/i', 'JavaScript', $text);
echo $result; // "JavaScript is great. JavaScript rocks!"

// Multiple patterns and replacements: pairs are applied one after another, in order
$text = "Today is 2023-12-25";
$rewrites = [
    '/(\d{4})-(\d{2})-(\d{2})/' => '$3/$2/$1', // reorder year-month-day into day/month/year
    '/\bToday\b/' => 'The date',
];
$result = preg_replace(array_keys($rewrites), array_values($rewrites), $text);
echo $result; // "The date is 25/12/2023"

// Using callbacks for complex replacements: the callback receives the match array
// and returns the replacement text
$appendCelsius = function ($hit) {
    $degreesF = $hit[1];
    $degreesC = round(($degreesF - 32) * 5/9, 1);
    return "{$degreesF}F ({$degreesC}C)";
};
$text = "Temperature: 32F, 100F, 212F";
$result = preg_replace_callback('/(\d+)F/', $appendCelsius, $text);
echo $result; // "Temperature: 32F (0C), 100F (37.8C), 212F (100C)"
?>
preg_replace_callback() - Advanced Replacements
<?php
// URL shortener example: each URL is swapped for a short link built from
// the first 8 hex characters of its MD5 digest
$shortenUrl = function ($hit) {
    return 'short.ly/' . substr(md5($hit[0]), 0, 8);
};
$text = "Visit https://www.verylongdomainname.com/very/long/path/to/page for more info";
$result = preg_replace_callback('/https?:\/\/[^\s]+/', $shortenUrl, $text);
echo $result; // "Visit short.ly/a1b2c3d4 for more info"

// Markdown-like bold text converter: the lazy quantifier keeps each **...** pair separate
$text = "This is **bold** and this is **also bold**.";
$result = preg_replace_callback(
    '/\*\*(.*?)\*\*/',
    function ($hit) {
        return '<strong>' . $hit[1] . '</strong>';
    },
    $text
);
echo $result; // "This is <strong>bold</strong> and this is <strong>also bold</strong>."

// Template variable replacement: {{name}} or {{name|filter}}
$template = "Hello {{name}}, your balance is {{balance|currency}}.";
$data = ['name' => 'John', 'balance' => 1234.56];
$renderPlaceholder = function ($hit) use ($data) {
    $key = $hit[1];
    // Unknown variables are left in the output exactly as written
    if (!isset($data[$key])) {
        return $hit[0];
    }
    // Group 3 is absent from the match array when no "|filter" part was given
    $modifier = isset($hit[3]) ? $hit[3] : null;
    if ($modifier === 'currency') {
        return '$' . number_format($data[$key], 2);
    }
    return $data[$key];
};
$result = preg_replace_callback('/\{\{(\w+)(\|(\w+))?\}\}/', $renderPlaceholder, $template);
echo $result; // "Hello John, your balance is $1,234.56."
?>
Text Validation
Common Validation Patterns
<?php
/**
 * Regex-based validators for common input formats.
 *
 * Every anchored pattern carries the D (PCRE_DOLLAR_ENDONLY) modifier. Without
 * it "$" also matches just before a trailing newline, so "value\n" would be
 * accepted as valid.
 */
class Validator {
    /**
     * Check that a string is shaped like an e-mail address.
     *
     * The host part is a dot-separated list of labels, each 1-63 characters
     * long and neither starting nor ending with a hyphen.
     *
     * @param string $email
     * @return bool
     */
    public static function email($email) {
        $pattern = '/^[a-zA-Z0-9.!#$%&\'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/D';
        return preg_match($pattern, $email) === 1;
    }

    /**
     * Check a US-style phone number with an optional area code.
     *
     * Matches: (123) 456-7890, 123-456-7890, 123.456.7890, 123 456 7890.
     * The area code is either fully parenthesised or bare, so unbalanced
     * forms such as "(123 456-7890" are rejected.
     *
     * @param string $phone
     * @return bool
     */
    public static function phone($phone) {
        $pattern = '/^(?:(?:\(\d{3}\)|\d{3})[-.\s]?)?\d{3}[-.\s]?\d{4}$/D';
        return preg_match($pattern, $phone) === 1;
    }

    /**
     * Check an http(s) URL: host, optional port, optional path, query and fragment.
     *
     * Hyphens are accepted in path, query and fragment (e.g. "/my-page"),
     * as are "%" and "~" in the path and "+" in the query.
     *
     * @param string $url
     * @return bool
     */
    public static function url($url) {
        $pattern = '/^https?:\/\/(?:[-\w.])+(?:\:[0-9]+)?(?:\/(?:[\w\/_.%~-])*(?:\?(?:[\w&=%.+-])*)?(?:\#(?:[\w.-])*)?)?$/D';
        return preg_match($pattern, $url) === 1;
    }

    /**
     * Check a dotted-quad IPv4 address; every octet must be in the range 0-255.
     *
     * @param string $ip
     * @return bool
     */
    public static function ipAddress($ip) {
        $pattern = '/^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/D';
        return preg_match($pattern, $ip) === 1;
    }

    /**
     * Check a card number: 13-19 digits (spaces and dashes ignored) that pass the Luhn checksum.
     *
     * @param string $number
     * @return bool
     */
    public static function creditCard($number) {
        // Remove spaces and dashes
        $number = preg_replace('/[\s-]/', '', $number);
        // Basic format check (13-19 digits)
        if (!preg_match('/^\d{13,19}$/D', $number)) {
            return false;
        }
        // Luhn algorithm check
        return self::luhnCheck($number);
    }

    /**
     * Check password strength: at least 8 characters drawn from letters, digits
     * and @$!%*?&, with at least one lowercase letter, one uppercase letter,
     * one digit and one of those special characters.
     *
     * @param string $password
     * @return bool
     */
    public static function strongPassword($password) {
        // Each lookahead asserts one requirement without consuming input
        $pattern = '/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$/D';
        return preg_match($pattern, $password) === 1;
    }

    /**
     * Luhn checksum over a string of digits.
     *
     * @param string $number digits only
     * @return bool true when the checksum is a multiple of 10
     */
    private static function luhnCheck($number) {
        $sum = 0;
        $alternate = false;
        // Walk from the rightmost digit, doubling every second one
        for ($i = strlen($number) - 1; $i >= 0; $i--) {
            $digit = intval($number[$i]);
            if ($alternate) {
                $digit *= 2;
                if ($digit > 9) {
                    // Same as adding the two digits of the doubled value
                    $digit -= 9;
                }
            }
            $sum += $digit;
            $alternate = !$alternate;
        }
        return ($sum % 10) === 0;
    }
}
// Usage examples
$tests = [
    'email' => ['user@example.com', 'invalid.email'],
    'phone' => ['(123) 456-7890', '123-456-7890', '12345'],
    'url' => ['https://example.com', 'not-a-url'],
    'ip' => ['192.168.1.1', '999.999.999.999'],
    'password' => ['StrongP@ss1', 'weak']
];
// Test keys whose name differs from the Validator method that handles them
$methodAliases = [
    'ip' => 'ipAddress',
    'password' => 'strongPassword',
];
foreach ($tests as $type => $examples) {
    echo "Testing $type:\n";
    // The method name must be a plain string for the Validator::$method() call
    $method = isset($methodAliases[$type]) ? $methodAliases[$type] : $type;
    foreach ($examples as $example) {
        $result = Validator::$method($example) ? 'Valid' : 'Invalid';
        echo " $example: $result\n";
    }
    echo "\n";
}
?>
Text Extraction and Parsing
Data Extraction
<?php
/**
 * Extract structured data (dates, prices, hashtags, mentions, words) from free text.
 */
class DataExtractor {
    /**
     * Find dates written as 2023-12-25, 12/25/2023 or "Dec 25, 2023".
     *
     * Results are grouped by format (ISO first, then US, then text), not by
     * position in the input. The components are not checked for calendar validity.
     *
     * @param string $text
     * @return array one entry per date: index 0 is the full match, 1-3 the captured parts
     */
    public static function extractDates($text) {
        $patterns = [
            '/\b(\d{4})-(\d{1,2})-(\d{1,2})\b/', // ISO format
            '/\b(\d{1,2})\/(\d{1,2})\/(\d{4})\b/', // US format
            '/\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+(\d{1,2}),?\s+(\d{4})\b/i' // Text format
        ];
        $dates = [];
        foreach ($patterns as $pattern) {
            preg_match_all($pattern, $text, $matches, PREG_SET_ORDER);
            $dates = array_merge($dates, $matches);
        }
        return $dates;
    }

    /**
     * Find dollar amounts and return them without the "$" sign.
     *
     * Accepts comma-grouped amounts ("1,234.56") and plain digit runs
     * ("1500.00"). The plain alternative is needed because a pattern that only
     * allows 1-3 leading digits would cut "$1500.00" down to "150".
     *
     * @param string $text
     * @return string[] amounts as written, grouping commas included
     */
    public static function extractPrices($text) {
        $pattern = '/\$((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?)/';
        preg_match_all($pattern, $text, $matches);
        return $matches[1]; // Return just the numeric parts
    }

    /**
     * Find #hashtags and return the tag names without the "#".
     *
     * @param string $text
     * @return string[]
     */
    public static function extractHashtags($text) {
        $pattern = '/#([a-zA-Z0-9_]+)/';
        preg_match_all($pattern, $text, $matches);
        return $matches[1];
    }

    /**
     * Find @mentions and return the user names without the "@".
     *
     * The lookbehind skips an "@" that directly follows a letter, digit or one
     * of "_ . + -", so the domain of an e-mail address is not reported as a mention.
     *
     * @param string $text
     * @return string[]
     */
    public static function extractMentions($text) {
        $pattern = '/(?<![a-zA-Z0-9_.+-])@([a-zA-Z0-9_]+)/';
        preg_match_all($pattern, $text, $matches);
        return $matches[1];
    }

    /**
     * Return the distinct lowercase ASCII words that are at least $minLength letters long.
     *
     * @param string $text
     * @param int $minLength minimum word length; values below 1 are treated as 1
     * @return string[] zero-indexed list in order of first appearance
     */
    public static function extractWords($text, $minLength = 3) {
        // Force an integer so nothing but a number is spliced into the pattern;
        // a minimum of 0 would make the pattern match empty strings
        $minLength = max(1, (int) $minLength);
        $pattern = '/\b[a-zA-Z]{' . $minLength . ',}\b/';
        preg_match_all($pattern, $text, $matches);
        // array_unique() keeps the original keys; re-index so callers get a plain list
        return array_values(array_unique(array_map('strtolower', $matches[0])));
    }
}
// Example usage
// "$29.99" and "$150.00" stay literal here: "$" followed by a digit is not a variable
$text = "
Event on Dec 25, 2023 or 2023-12-31.
Tickets cost $29.99 or $150.00.
Follow @speaker and use #conference #tech hashtags.
Contact info@example.com for details.
";
echo "Dates found:\n";
print_r(DataExtractor::extractDates($text));
echo "\nPrices found:\n";
print_r(DataExtractor::extractPrices($text));
echo "\nHashtags found:\n";
print_r(DataExtractor::extractHashtags($text));
echo "\nMentions found:\n";
print_r(DataExtractor::extractMentions($text));
?>
Log File Parsing
<?php
/**
 * Parsers for web-server access logs and PHP error logs.
 */
class LogParser {
    /**
     * Split one access-log line into its fields.
     *
     * Expected layout: IP - - [timestamp] "method path protocol" status size.
     * The pattern is not anchored at the end, so trailing fields after the
     * size are tolerated and ignored.
     *
     * @param string $logLine
     * @return array|null field map, or null when the line does not match
     */
    public static function parseAccessLog($logLine) {
        $regex = '/^(\S+) \S+ \S+ \[([^\]]+)\] "(\S+) ([^\s]+) ([^"]+)" (\d+) (\S+)/';
        if (!preg_match($regex, $logLine, $fields)) {
            return null;
        }
        list(, $ip, $timestamp, $method, $path, $protocol, $status, $size) = $fields;
        return [
            'ip' => $ip,
            'timestamp' => $timestamp,
            'method' => $method,
            'path' => $path,
            'protocol' => $protocol,
            'status' => (int) $status,
            // A size of "-" is reported as 0 bytes
            'size' => $size === '-' ? 0 : (int) $size,
        ];
    }

    /**
     * Split one PHP error-log line of the form
     * "[timestamp] level: message in file on line N".
     *
     * @param string $logLine
     * @return array|null field map, or null when the line does not match
     */
    public static function parseErrorLog($logLine) {
        $regex = '/^\[([^\]]+)\] ([^:]+): (.+) in (.+) on line (\d+)$/';
        if (!preg_match($regex, $logLine, $fields)) {
            return null;
        }
        list(, $timestamp, $level, $message, $file, $lineNumber) = $fields;
        return [
            'timestamp' => $timestamp,
            'level' => $level,
            'message' => $message,
            'file' => $file,
            'line' => (int) $lineNumber,
        ];
    }

    /**
     * Collect the parsed entries of every line that mentions a bot keyword.
     *
     * The keyword test runs against the whole line, so a keyword appearing
     * anywhere in it (for example in a request path such as /robots.txt)
     * selects the line. Lines that cannot be parsed are skipped.
     *
     * @param iterable $logLines
     * @return array list of parsed access-log entries
     */
    public static function extractBotTraffic($logLines) {
        $botPattern = '/bot|spider|crawler|scraper|curl|wget/i';
        $botLogs = [];
        foreach ($logLines as $line) {
            if (!preg_match($botPattern, $line)) {
                continue;
            }
            $entry = self::parseAccessLog($line);
            if (!$entry) {
                continue;
            }
            $botLogs[] = $entry;
        }
        return $botLogs;
    }
}
// Example log parsing: one access-log line ...
$accessLog = '192.168.1.1 - - [25/Dec/2023:10:00:00 +0000] "GET /index.php HTTP/1.1" 200 1234';
$accessData = LogParser::parseAccessLog($accessLog);
print_r($accessData);

// ... and one PHP error-log line
$errorLog = '[25-Dec-2023 10:00:00 UTC] PHP Parse error: syntax error in /var/www/test.php on line 42';
$errorData = LogParser::parseErrorLog($errorLog);
print_r($errorData);
?>
Advanced Patterns
Lookaheads and Lookbehinds
<?php
// Positive lookahead (?=...)
$text = "password123 and secretpass and mypassword456";
// Letters only: \w also matches digits, so \w+(?=\d) would swallow part of
// the number and return 'password12' instead of 'password'
$pattern = '/[a-zA-Z]+(?=\d)/'; // Letter runs followed by a digit
preg_match_all($pattern, $text, $matches);
print_r($matches[0]); // ['password', 'mypassword']

// Negative lookahead (?!...)
$text = "user123 admin guest456 root";
// The lookahead must reject letters as well as digits; otherwise the engine
// backtracks to a shorter prefix (or, with \w+, simply consumes the digits)
$pattern = '/\b[a-zA-Z]+(?![a-zA-Z\d])/'; // Whole words NOT followed by digits
preg_match_all($pattern, $text, $matches);
print_r($matches[0]); // ['admin', 'root']

// Positive lookbehind (?<=...)
$text = '$100 €50 £30 ¥200';
$pattern = '/(?<=\$)\d+/'; // Numbers preceded by dollar sign
preg_match_all($pattern, $text, $matches);
print_r($matches[0]); // ['100']

// Negative lookbehind (?<!...)
$text = "123abc 456def 789xyz";
// Letters are excluded too, so a match cannot simply start one character
// later ('bc') to get around the digit check
$pattern = '/(?<![\da-z])[a-z]+/'; // Letter runs NOT preceded by digits
preg_match_all($pattern, $text, $matches);
print_r($matches[0]); // []
// Complex validation: password with requirements
/**
 * Check a password against a list of independent regex rules.
 *
 * @param string $password
 * @return bool true only when every rule matches
 */
function validateComplexPassword($password) {
    $rules = [
        'lowercase' => '/(?=.*[a-z])/',
        'uppercase' => '/(?=.*[A-Z])/',
        'digit' => '/(?=.*\d)/',
        'special character' => '/(?=.*[!@#$%^&*])/',
        'minimum length of 8' => '/^.{8,}$/',
        // Negative lookahead: fails when any character occurs three times in a row
        'no triple repeats' => '/^(?!.*(.)\1{2})/',
    ];
    foreach ($rules as $regex) {
        // Stop at the first rule that does not match
        if (preg_match($regex, $password) !== 1) {
            return false;
        }
    }
    return true;
}
// Prints "Valid" for the first candidate and "Invalid" for the second
foreach (['StrongP@ss1', 'weakpass'] as $candidate) {
    echo validateComplexPassword($candidate) ? 'Valid' : 'Invalid';
}
?>
Named Capture Groups
<?php
// Named groups for better readability: (?P<label>...) captures are available
// under their label as well as their numeric index
$text = "John Doe (john.doe@example.com)";
$pattern = '/(?P<name>[A-Za-z\s]+) \((?P<email>[^)]+)\)/';
if (preg_match($pattern, $text, $matches)) {
    echo "Name: " . $matches['name'] . "\n"; // John Doe
    echo "Email: " . $matches['email'] . "\n"; // john.doe@example.com
}

// Extract structured data from URLs
$url = "https://example.com/user/123/profile?tab=settings";
$pattern = '/^(?P<scheme>https?):\/\/(?P<host>[^\/]+)\/(?P<path>.+?)(?:\?(?P<query>.+))?$/';
if (preg_match($pattern, $url, $matches)) {
    // The query group is optional, hence the ?? fallbacks
    echo "Scheme: " . ($matches['scheme'] ?? 'N/A') . "\n";
    echo "Host: " . ($matches['host'] ?? 'N/A') . "\n";
    echo "Path: " . ($matches['path'] ?? 'N/A') . "\n";
    echo "Query: " . ($matches['query'] ?? 'N/A') . "\n";
}

// Parse configuration files: the "m" modifier lets ^ and $ match at every line
$config = "database.host=localhost\ndatabase.port=3306\napp.debug=true";
$pattern = '/^(?P<section>\w+)\.(?P<key>\w+)=(?P<value>.+)$/m';
preg_match_all($pattern, $config, $matches, PREG_SET_ORDER);
$configArray = [];
foreach ($matches as $match) {
    $configArray[$match['section']][$match['key']] = $match['value'];
}
print_r($configArray);
/*
Array (
[database] => Array (
[host] => localhost
[port] => 3306
)
[app] => Array (
[debug] => true
)
)
*/
?>
Text Processing Utilities
Text Cleaning and Formatting
<?php
/**
 * Regex-based helpers for cleaning and formatting text.
 */
class TextProcessor {
    /**
     * Normalise line endings, collapse runs of spaces/tabs and blank lines, then trim.
     *
     * @param string $text
     * @return string
     */
    public static function cleanWhitespace($text) {
        $text = preg_replace('/\r\n|\r/', "\n", $text); // Normalize line endings
        $text = preg_replace('/[ \t]+/', ' ', $text); // Multiple spaces to single
        $text = preg_replace('/\n\s*\n/', "\n", $text); // Multiple newlines to single
        return trim($text);
    }

    /**
     * Remove HTML tags, optionally keeping the listed tag names.
     *
     * Only the tags are removed; text between an opening and closing tag
     * (including the body of a <script> element) stays in the result, so this
     * is a formatting helper and not an XSS filter.
     *
     * @param string $html
     * @param string[] $allowedTags tag names to keep, e.g. ['strong', 'em']
     * @return string
     */
    public static function stripHtmlTags($html, $allowedTags = []) {
        if (empty($allowedTags)) {
            return preg_replace('/<[^>]*>/', '', $html);
        }
        // Quote with the pattern delimiter so a "/" in a tag name cannot end the regex early
        $quotedTags = array_map(function ($tag) {
            return preg_quote($tag, '/');
        }, $allowedTags);
        $allowed = implode('|', $quotedTags);
        // The negative lookahead skips opening and closing forms of allowed tags;
        // \b stops "em" from also protecting "embed"
        $pattern = '/<(?!\/?(?:' . $allowed . ')\b)[^>]*>/i';
        return preg_replace($pattern, '', $html);
    }

    /**
     * Return every piece of text found between $start and $end markers.
     *
     * @param string $text
     * @param string $start literal opening marker
     * @param string $end literal closing marker
     * @return string[] the enclosed texts, markers excluded
     */
    public static function extractTextBetween($text, $start, $end) {
        $startQuoted = preg_quote($start, '/');
        $endQuoted = preg_quote($end, '/');
        // Lazy match plus the "s" modifier: shortest span, newlines allowed
        $pattern = '/' . $startQuoted . '(.*?)' . $endQuoted . '/s';
        preg_match_all($pattern, $text, $matches);
        return $matches[1];
    }

    /**
     * Wrap whole-word, case-insensitive occurrences of the keywords in a <span>.
     *
     * @param string $text
     * @param string[] $keywords literal keywords; empty strings are ignored
     * @param string $highlightClass CSS class placed on the span
     * @return string $text unchanged when there is no usable keyword
     */
    public static function highlightKeywords($text, $keywords, $highlightClass = 'highlight') {
        $escapedKeywords = [];
        foreach ($keywords as $keyword) {
            if ((string) $keyword === '') {
                continue;
            }
            // Quote with the delimiter so keywords containing "/" stay literal
            $escapedKeywords[] = preg_quote($keyword, '/');
        }
        // An empty alternation would match the empty string at every word
        // boundary and scatter empty spans through the text
        if (!$escapedKeywords) {
            return $text;
        }
        $pattern = '/\b(' . implode('|', $escapedKeywords) . ')\b/i';
        return preg_replace($pattern, '<span class="' . $highlightClass . '">$1</span>', $text);
    }

    /**
     * Shorten text to at most $wordLimit whitespace-separated words.
     *
     * @param string $text
     * @param int $wordLimit
     * @param string $suffix appended only when words were dropped
     * @return string the original text when it is already within the limit
     */
    public static function truncateWords($text, $wordLimit, $suffix = '...') {
        $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
        if (count($words) <= $wordLimit) {
            return $text;
        }
        return implode(' ', array_slice($words, 0, $wordLimit)) . $suffix;
    }

    /**
     * Build a URL slug: lowercase ASCII letters and digits joined by single hyphens.
     *
     * @param string $text
     * @return string
     */
    public static function generateSlug($text) {
        // Convert to lowercase
        $slug = strtolower($text);
        // Replace each run of non-alphanumeric characters with one hyphen; the "+"
        // already prevents consecutive hyphens, so no second pass is needed
        $slug = preg_replace('/[^a-z0-9]+/', '-', $slug);
        // Remove leading/trailing hyphens
        return trim($slug, '-');
    }
}
// Example usage
$html = '<p>This is <strong>bold</strong> and <em>italic</em> text with <script>alert("xss")</script>.</p>';
$cleaned = TextProcessor::stripHtmlTags($html, ['strong', 'em']);
// Only the tags are stripped: the text that was inside <script> remains in the output
echo $cleaned . "\n"; // 'This is <strong>bold</strong> and <em>italic</em> text with alert("xss").'

$text = "PHP is a programming language. PHP is widely used for web development.";
$highlighted = TextProcessor::highlightKeywords($text, ['PHP', 'programming']);
echo $highlighted . "\n";

$title = "How to Learn PHP: A Comprehensive Guide!";
$slug = TextProcessor::generateSlug($title);
echo $slug . "\n"; // "how-to-learn-php-a-comprehensive-guide"
?>
Performance and Best Practices
Regex Optimization
<?php
/**
 * Small helpers that illustrate regex performance practices.
 */
class RegexOptimizer {
    /**
     * Validate an e-mail address using non-capturing groups.
     *
     * Use non-capturing groups when you don't need the captured content.
     * Instead of: /^([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)\.([a-zA-Z]{2,})$/
     * use the (?:...) form below.
     *
     * The pattern is anchored (with the D modifier so "$" cannot match before a
     * trailing newline); an unanchored pattern would accept any text that
     * merely contains an address.
     *
     * @param string $email
     * @return int|false 1 when valid, 0 when not, false on a PCRE error
     */
    public static function optimizedEmailValidation($email) {
        $pattern = '/^(?:[a-zA-Z0-9._%+-]+)@(?:[a-zA-Z0-9.-]+)\.(?:[a-zA-Z]{2,})$/D';
        return preg_match($pattern, $email);
    }

    // Patterns that have already been checked, keyed by the pattern string
    private static $compiledPatterns = [];

    /**
     * Check a pattern once and remember that it is usable.
     *
     * The stored value is the pattern string itself; the check is a trial
     * preg_match() against an empty subject.
     *
     * @param string $pattern
     * @return string the same pattern
     * @throws InvalidArgumentException when the trial match reports failure
     */
    public static function getCompiledPattern($pattern) {
        if (!isset(self::$compiledPatterns[$pattern])) {
            // Validate pattern
            if (@preg_match($pattern, '') === false) {
                throw new InvalidArgumentException("Invalid regex pattern: $pattern");
            }
            self::$compiledPatterns[$pattern] = $pattern;
        }
        return self::$compiledPatterns[$pattern];
    }

    /**
     * Time repeated preg_match() calls.
     *
     * @param string $pattern
     * @param string $text
     * @param int $iterations
     * @return float elapsed wall-clock time in milliseconds
     */
    public static function benchmarkPattern($pattern, $text, $iterations = 10000) {
        $start = microtime(true);
        for ($i = 0; $i < $iterations; $i++) {
            preg_match($pattern, $text);
        }
        $end = microtime(true);
        return ($end - $start) * 1000; // Return milliseconds
    }

    /**
     * Compare a regex search with strpos() for a literal search term.
     *
     * @param string $text
     * @param string $searchTerm literal text to look for
     * @param int $iterations repetitions used for both measurements
     * @return array ['regex' => ms, 'strpos' => ms, 'faster' => 'strpos'|'regex']
     */
    public static function compareStringSearch($text, $searchTerm, $iterations = 10000) {
        // Quote with the delimiter: a "/" in the term would otherwise end the pattern early
        $regexTime = self::benchmarkPattern('/' . preg_quote($searchTerm, '/') . '/', $text, $iterations);
        $start = microtime(true);
        for ($i = 0; $i < $iterations; $i++) {
            $found = strpos($text, $searchTerm) !== false;
        }
        $strposTime = (microtime(true) - $start) * 1000;
        return [
            'regex' => $regexTime,
            'strpos' => $strposTime,
            'faster' => $strposTime < $regexTime ? 'strpos' : 'regex'
        ];
    }
}
// Security: Prevent ReDoS (Regular Expression Denial of Service)
/**
 * Wrapper around preg_match() that runs with reduced PCRE limits.
 */
class SafeRegex {
    private static $maxExecutionTime = 1000; // 1 second in milliseconds

    /**
     * Run preg_match() with lowered backtrack and recursion limits.
     *
     * The limits apply to this call only: the previous ini values are put back
     * afterwards, so unrelated regex code in the same process is unaffected.
     * The time check happens after the match has finished; it reports a slow
     * match but cannot interrupt one.
     *
     * @param string $pattern
     * @param string $subject
     * @param array|null $matches filled with the match groups, as with preg_match()
     * @return int 1 when the pattern matched, 0 when it did not
     * @throws RuntimeException when preg_match() fails (the exception code is the
     *                          preg_last_error() value) or the match took too long
     */
    public static function safeMatch($pattern, $subject, &$matches = null) {
        $start = microtime(true);
        // Set PCRE limits, remembering the old values (ini_set returns them)
        $previousBacktrack = ini_set('pcre.backtrack_limit', '100000');
        $previousRecursion = ini_set('pcre.recursion_limit', '100000');
        try {
            $result = @preg_match($pattern, $subject, $matches);
            $errorCode = preg_last_error();
        } finally {
            // ini_set() returns false when it could not change the setting;
            // in that case there is nothing to restore
            if ($previousBacktrack !== false) {
                ini_set('pcre.backtrack_limit', $previousBacktrack);
            }
            if ($previousRecursion !== false) {
                ini_set('pcre.recursion_limit', $previousRecursion);
            }
        }
        $executionTime = (microtime(true) - $start) * 1000;
        if ($result === false) {
            throw new RuntimeException('Regex execution failed', $errorCode);
        }
        if ($executionTime > self::$maxExecutionTime) {
            throw new RuntimeException('Regex execution timeout');
        }
        return $result;
    }

    /**
     * Heuristic screen for a few constructs known to cause trouble.
     *
     * Looks only for the three literal shapes listed below; passing this check
     * does not prove that a pattern is safe or even syntactically valid.
     *
     * @param string $pattern
     * @return bool false when one of the listed shapes is found
     */
    public static function validatePattern($pattern) {
        // Check for potentially dangerous patterns
        $dangerousPatterns = [
            '/\(\?\=.*\)\+/', // Repeated lookahead group: (?=...)+
            '/\(\?\!\)/', // Empty negative lookahead: (?!)
            '/\(\.\*\)\+/', // Nested quantifiers: (.*)+
        ];
        foreach ($dangerousPatterns as $dangerous) {
            if (preg_match($dangerous, $pattern)) {
                return false;
            }
        }
        return true;
    }
}
// Example usage
$pattern = '/^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/';
$email = 'user@example.com';
try {
    // safeMatch() returns 1 or 0 and throws on a PCRE failure or a slow match
    $result = SafeRegex::safeMatch($pattern, $email);
    echo $result ? 'Valid email' : 'Invalid email';
} catch (RuntimeException $e) {
    echo 'Regex error: ' . $e->getMessage();
}
?>
PHP's regular expressions provide powerful text processing capabilities, but should be used judiciously with attention to performance and security considerations.