الحصول على أجزاء عنوان URL (Regex)

https://stackoverflow.com/questions/27745

09-06-2019
|

سؤال

نظرا URL (سطر واحد):
http://test.example.com/dir/subdir/file.html

كيف يمكنني استخراج الأجزاء التالية باستخدام التعبيرات العادية:

النطاق الفرعي (test)
المجال (example.com)
المسار دون الملف (/dir/subdir/)
الملف (file.html)
مسار مع الملف (/dir/subdir/file.html)
URL دون المسار (http://test.example.com)
(إضافة أي الأخرى التي تعتقد أنها ستكون مفيدة)

على regex يجب أن تعمل بشكل صحيح حتى لو كنت أدخل الرابط التالي:

http://example.example.com/example/example/example.html

المحلول

واحد regex تحليل تفكك a URL الكامل بما في ذلك معلمات الاستعلام و المراسي مثلا

https://www.google.com/dir/1/2/search.html?arg=0-a&arg1=1-b&arg3-c#hash

^((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$

RexEx المواقف:

url:RegExp['$&'],

البروتوكول:RegExp.$2,

المضيف:RegExp.$3,

المسار:RegExp.$4,

الملف:RegExp.$6,

الاستعلام:RegExp.$7,

التجزئة:RegExp.$8

ثم هل يمكن مزيد من تحليل المضيف ('.' محدد) بسهولة تامة.

ما كنت سأفعله أنا هو استخدام شيء مثل هذا:

/*
    ^(.*:)//([A-Za-z0-9\-\.]+)(:[0-9]+)?(.*)$
*/
proto $1
host $2
port $3
the-rest $4

ثم مواصلة تحليل 'the-rest' ليكون محددًا قدر الإمكان. القيام بكل ذلك في تعبير عادي واحد أمر مجنون قليلًا.

نصائح أخرى

وأنا أدرك أنا في وقت متأخر إلى الحزب ، ولكن هناك طريقة بسيطة للسماح المتصفح parse url لك دون regex:

// Let the browser do the parsing: once a URL is assigned to an anchor's href,
// the element exposes the individual URL parts as properties.
var a = document.createElement('a');
a.href = 'http://www.example.com:123/foo/bar.html?fox=trot#foo';

// Print every part as "name: value".
var urlParts = ['href','protocol','host','hostname','port','pathname','search','hash'];
for (var partIndex = 0; partIndex < urlParts.length; partIndex += 1) {
    console.log(urlParts[partIndex]+':', a[urlParts[partIndex]]);
}

/*//Output:
href: http://www.example.com:123/foo/bar.html?fox=trot#foo
protocol: http:
host: www.example.com:123
hostname: www.example.com
port: 123
pathname: /foo/bar.html
search: ?fox=trot
hash: #foo
*/

أنا بضع سنوات في وقت متأخر إلى الحزب ، ولكن أنا مندهش أنه لا أحد قد ذكر معرف الموارد الموحد مواصفات لديه قسم تحليل محددات مع التعبير العادي.التعبير العادي ، الذي كتبه بيرنرز لي ، et al. ، هو:

^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
 12            3  4          5       6  7        8 9
الأرقام في السطر الثاني أعلاه هي فقط لتسهيل القراءة؛ فهي تشير إلى النقاط المرجعية لكل تعبير فرعي (أي كل زوج من الأقواس). نشير إلى القيمة المطابقة للتعبير الفرعي <n> بالرمز $<n>. على سبيل المثال، مطابقة التعبير أعلاه مع

http://www.ics.uci.edu/pub/ietf/uri/#Related

ينتج عنها مطابقات التعبيرات الفرعية التالية:
$1 = http:
$2 = http
$3 = //www.ics.uci.edu
$4 = www.ics.uci.edu
$5 = /pub/ietf/uri/
$6 = <undefined>
$7 = <undefined>
$8 = #Related
$9 = Related

لما يستحقه الأمر، وجدت أنه كان عليّ تهريب (escape) الشرطات المائلة الأمامية في جافا سكريبت:

^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?

وجدت أعلى صوت الإجابة (hometoast الجواب) لا يعمل تماما بالنسبة لي.مشكلتين:

فإنه لا يمكن التعامل مع رقم المنفذ.
تجزئة الجزء المكسور.

ما يلي هو نسخة معدلة:

^((http[s]?|ftp):\/)?\/?([^:\/\s]+)(:([^\/]*))?((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(\?([^#]*))?(#(.*))?$

موقف أجزاء هي كما يلي:

int SCHEMA = 2, DOMAIN = 3, PORT = 5, PATH = 6, FILE = 8, QUERYSTRING = 9, HASH = 12

تحرير نشرت من قبل مستخدم حالا:

/**
 * Extract the file-name component of a URL.
 *
 * @param {string} path  The URL to inspect.
 * @returns {?string} The file name (capture group 8 of the pattern), or null when
 *                    the URL does not match the pattern at all.
 */
function getFileName(path) {
    var parts = path.match(/^((http[s]?|ftp):\/)?\/?([^:\/\s]+)(:([^\/]*))?((\/[\w\/-]+)*\/)([\w\-\.]+[^#?\s]+)(\?([^#]*))?(#(.*))?$/i);
    // match() yields null for URLs the pattern cannot parse (for example a URL with
    // no path); indexing that null directly used to throw a TypeError.
    return parts ? parts[8] : null;
}

أنا في حاجة إلى التعبير العادي لتتناسب مع جميع عناوين url وجعل هذا واحد:

/(?:([^\:]*)\:\/\/)?(?:([^\:\@]*)(?:\:([^\@]*))?\@)?(?:([^\/\:]*)\.(?=[^\.\/\:]*\.[^\.\/\:]*))?([^\.\/\:]*)(?:\.([^\/\.\:]*))?(?:\:([0-9]*))?(\/[^\?#]*(?=.*?\/)\/)?([^\?#]*)?(?:\?([^#]*))?(?:#(.*))?/

يطابق كل عناوين المواقع ، أي بروتوكول عناوين مثل

ftp://user:pass@www.cs.server.com:8080/dir1/dir2/file.php?param1=value1#hashtag

النتيجة (في جافا سكريبت) يبدو مثل هذا:

["ftp", "user", "pass", "www.cs", "server", "com", "8080", "/dir1/dir2/", "file.php", "param1=value1", "hashtag"]

رابط مثل

mailto://admin@www.cs.server.com

يبدو مثل هذا:

["mailto", "admin", undefined, "www.cs", "server", "com", undefined, undefined, undefined, undefined, undefined]

كنت أحاول حل هذه في جافا سكريبت ، التي ينبغي التعامل معها من قبل:

var url = new URL('http://a:b@example.com:890/path/wah@t/foo.js?foo=bar&bingobang=&king=kong@kong.com#foobar/bing/bo@ng?bang');

منذ (في كروم ، على الأقل) أن يوزع إلى:

{
  "hash": "#foobar/bing/bo@ng?bang",
  "search": "?foo=bar&bingobang=&king=kong@kong.com",
  "pathname": "/path/wah@t/foo.js",
  "port": "890",
  "hostname": "example.com",
  "host": "example.com:890",
  "password": "b",
  "username": "a",
  "protocol": "http:",
  "origin": "http://example.com:890",
  "href": "http://a:b@example.com:890/path/wah@t/foo.js?foo=bar&bingobang=&king=kong@kong.com#foobar/bing/bo@ng?bang"
}

ومع ذلك ، فإن هذا ليس عبر متصفح (https://developer.mozilla.org/en-US/docs/Web/API/URL) لذا مرقع معا لسحب نفس أجزاء على النحو الوارد أعلاه:

^(?:(?:(([^:\/#\?]+:)?(?:(?:\/\/)(?:(?:(?:([^:@\/#\?]+)(?:\:([^:@\/#\?]*))?)@)?(([^:\/#\?\]\[]+|\[[^\/\]@#?]+\])(?:\:([0-9]+))?))?)?)?((?:\/?(?:[^\/\?#]+\/+)*)(?:[^\?#]*)))?(\?[^#]+)?)(#.*)?

الفضل في هذا regex يذهب إلى https://gist.github.com/rpflorence الذي نشر هذا jsperf http://jsperf.com/url-parsing (الأصل العثور عليها هنا: https://gist.github.com/jlong/2428561#comment-310066) الذي جاء مع regex كان هذا في الأصل على أساس.

الأجزاء في هذا النظام:

// Names of the parsed URL parts, listed in the order given by the text above.
// The trailing comment on each entry shows that part for the sample URL
// http://user:pass@host.com:81/directory/file.ext?query=1#anchor.
var keys = [
    "href",                    // http://user:pass@host.com:81/directory/file.ext?query=1#anchor
    "origin",                  // http://user:pass@host.com:81
    "protocol",                // http:
    "username",                // user
    "password",                // pass
    "host",                    // host.com:81
    "hostname",                // host.com
    "port",                    // 81
    "pathname",                // /directory/file.ext
    "search",                  // ?query=1
    "hash"                     // #anchor
];

وهناك أيضا مكتبة صغيرة التي يلتف عليه و يوفر الاستعلام params:

https://github.com/sadams/lite-url (متاحة أيضا على باور)

إذا كنت قد تحسن ، يرجى إنشاء طلب سحب مع المزيد من الاختبارات و سوف تقبل الدمج مع الشكر.

اقتراح أكثر قابلية للقراءة الحل (في بيثون ، ولكن ينطبق على أي regex):

def url_path_to_dict(path):
    """Split a URL into its named parts.

    Returns a dict with the keys ``schema``, ``user``, ``password``, ``host``,
    ``port``, ``path`` and ``query`` (parts missing from the input are ``None``),
    or ``None`` when the pattern does not match ``path``.
    """
    # One fragment per URL part, so each named group can be read on its own line.
    fragments = [
        r'^',
        r'((?P<schema>.+?)://)?',
        r'((?P<user>.+?)(:(?P<password>.*?))?@)?',
        r'(?P<host>.*?)',
        r'(:(?P<port>\d+?))?',
        r'(?P<path>/.*?)?',
        r'(?P<query>[?].*?)?',
        r'$',
    ]
    matched = re.compile(''.join(fragments)).match(path)
    if matched is None:
        return None
    return matched.groupdict()

def main():
    """Print the parsed parts of a sample URL."""
    # Call print as a function: the bare print statement is a SyntaxError on Python 3,
    # while this form prints the same single value on Python 2 as well.
    print(url_path_to_dict('http://example.example.com/example/example/example.html'))

طباعة:

{
'host': 'example.example.com', 
'user': None, 
'path': '/example/example/example.html', 
'query': None, 
'password': None, 
'port': None, 
'schema': 'http'
}

الفرعي المجال صعبة لأن فرعي يمكن أن يكون لها عدة أجزاء ، كما أن نطاق المستوى الأعلى ، http://sub1.sub2.domain.co.uk/

 the path without the file : http://[^/]+/((?:[^/]+/)*(?:[^/]+$)?)  
 the file : http://[^/]+/(?:[^/]+/)*((?:[^/.]+\.)+[^/.]+)$  
 the path with the file : http://[^/]+/(.*)  
 the URL without the path : (http://[^/]+/)

(Markdown ليست ودية للغاية مع التعبيرات العادية)

هذه نسخة محسنة يجب العمل بشكل موثوق كما محلل.

   // Applies to URI, not just URL or URN:
   //    http://en.wikipedia.org/wiki/Uniform_Resource_Identifier#Relationship_to_URL_and_URN
   //
   // http://labs.apache.org/webarch/uri/rfc/rfc3986.html#regexp
   //
   // (?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?
   //
   // http://en.wikipedia.org/wiki/URI_scheme#Generic_syntax
   //
   // $& matches the entire uri
   // $1 matches scheme (ftp, http, mailto, mshelp, ymsgr, etc)
   // $2 matches authority (host, user:pwd@host, etc)
   // $3 matches path
   // $4 matches query (http GET REST api, etc)
   // $5 matches fragment (html anchor, etc)
   //
   // Match specific schemes, non-optional authority, disallow white-space so can delimit in text, and allow 'www.' w/o scheme
   // Note the schemes must match ^[^\s|:/?#]+(?:\|[^\s|:/?#]+)*$
   //
   // (?:()(www\.[^\s/?#]+\.[^\s/?#]+)|(schemes)://([^\s/?#]*))([^\s?#]*)(?:\?([^\s#]*))?(?:#(\S*))?
   //
   // Validate the authority with an orthogonal RegExp, so the RegExp above won’t fail to match any valid urls.
   /**
    * Build a RegExp that locates URIs inside free text.
    *
    * @param {string} flags            RegExp flags, passed through unchanged (e.g. 'gi').
    * @param {string} [schemes]        '|'-separated list of accepted schemes (e.g. 'http|https');
    *                                  when falsy, any scheme name is accepted.
    * @param {boolean} [noSubMatches]  When truthy, the returned RegExp has no capture groups.
    * @returns {RegExp} With sub-matches the groups are: $1 '' and $2 the 'www.' authority when
    *                   the URI has no scheme, otherwise $3 scheme and $4 authority; then
    *                   $5 path, $6 query, $7 fragment.
    * @throws {TypeError} If schemes is not a '|'-separated list of scheme names.
    */
   function uriRegExp( flags, schemes/* = null*/, noSubMatches/* = false*/ )
   {
      if( !schemes )
         schemes = '[^\\s:/?#]+';
      else if( !/^[^\s|:\/?#]+(?:\|[^\s|:\/?#]+)*$/.test( schemes ) )
         throw new TypeError( 'expected URI schemes' );

      if( noSubMatches )
         // The scheme list must be wrapped in (?:...): otherwise the '|' of a list such as
         // 'ftp|http' merges into the outer alternation and a bare 'ftp' counts as a match.
         return new RegExp( '(?:www\\.[^\\s/?#]+\\.[^\\s/?#]+|(?:' + schemes + ')://[^\\s/?#]*)[^\\s?#]*(?:\\?[^\\s#]*)?(?:#\\S*)?', flags );

      return new RegExp( '(?:()(www\\.[^\\s/?#]+\\.[^\\s/?#]+)|(' + schemes + ')://([^\\s/?#]*))([^\\s?#]*)(?:\\?([^\\s#]*))?(?:#(\\S*))?', flags );
   }

   // http://en.wikipedia.org/wiki/URI_scheme#Official_IANA-registered_schemes
   /**
    * Return a fixed list of URI scheme names as a single '|'-separated string.
    */
   function uriSchemesRegExp()
   {
      var schemeNames = [
         'about', 'callto', 'ftp', 'gtalk', 'http', 'https', 'irc', 'ircs', 'javascript',
         'mailto', 'mshelp', 'sftp', 'ssh', 'steam', 'tel', 'view-source', 'ymsgr'
      ];
      return schemeNames.join( '|' );
   }

جرب ما يلي:

^((ht|f)tp(s?)\:\/\/|~/|/)?([\w]+:\w+@)?([a-zA-Z]{1}([\w\-]+\.)+([\w]{2,5}))(:[\d]{1,5})?((/?\w+/)+|/?)(\w+\.[\w]{3,4})?((\?\w+=\w+)?(&\w+=\w+)*)?

وهو يدعم HTTP / FTP, نطاقات فرعية, المجلدات, الملفات الخ.

لقد وجدت أنه من جوجل للبحث سريع:

http://geekswithblogs.net/casualjim/archive/2005/12/01/61722.aspx

/^((?P<scheme>https?|ftp):\/)?\/?((?P<username>.*?)(:(?P<password>.*?)|)@)?(?P<hostname>[^:\/\s]+)(?P<port>:([^\/]*))?(?P<path>(\/\w+)*\/)(?P<filename>[-\w.]+[^#?\s]*)?(?P<query>\?([^#]*))?(?P<fragment>#(.*))?$/

من جوابي على مثل هذا السؤال.يعمل بشكل أفضل من بعض من الآخرين ذكر لأن لديهم بعض الأخطاء (مثل عدم دعم اسم المستخدم/كلمة المرور لا تدعم حرف واحد أسماء ، جزء معرفات كونها مكسورة).

يمكنك الحصول على جميع http/https, host, port, المسار وكذلك الاستعلام باستخدام الكائن Uri في ‎.NET. المهمة الصعبة الوحيدة هي تقسيم المضيف إلى نطاق فرعي واسم نطاق و TLD.

لا يوجد معيار القيام بذلك و لا يمكن ببساطة استخدام سلسلة تحليل أو RegEx لإنتاج النتيجة الصحيحة.في البداية, أنا باستخدام التعابير المنطقية وظيفة ولكن ليس كل عنوان URL يمكن تحليل فرعي بشكل صحيح.ممارسة طريقة هي استخدام قائمة نطاقات Tld.بعد TLD URL ويعرف الجزء الأيسر هو المجال المتبقي هو المجال الفرعي.

ومع ذلك قائمة تحتاج إلى الحفاظ عليه منذ نطاقات Tld الجديدة هو ممكن.اللحظة الراهنة أعرفه publicsuffix.org الحفاظ على أحدث قائمة يمكنك استخدام domainname محلل أدوات من جوجل كود تحليل العامة لاحقة القائمة على النطاق الفرعي المجال و TLD بسهولة باستخدام DomainName وجوه:domainName.فرعي ، domainName.المجال domainName.TLD.

هذه الإجابات أيضا مفيدا:الحصول على دومين فرعي من URL

CaLLMeLaNN

هنا هو واحد كاملة ، و لا تعتمد على أي بروتوكول.

/**
 * Return the server part of a URL: the optional "<anything>//" prefix plus
 * everything after it up to (not including) the first '/', '?', '#' or ';'.
 * The result is also written to the console.
 *
 * @param {string} url  The URL to trim.
 * @returns {string} The leading server portion of the URL.
 */
function getServerURL(url) {
    var serverPattern = new RegExp("(^(?:(?:.*?)?//)?[^/?#;]*)");
    var server = serverPattern.exec(url)[1];
    console.log(server); // Remove this
    return server;
}

// Sample inputs: absolute, protocol-relative and scheme-less URLs, with and without a
// path, query string, fragment or port. Each call logs the part getServerURL returns.
getServerURL("http://dev.test.se")
getServerURL("http://dev.test.se/")
getServerURL("//ajax.googleapis.com/ajax/libs/jquery/1.8.3/jquery.min.js")
getServerURL("//")
getServerURL("www.dev.test.se/sdas/dsads")
getServerURL("www.dev.test.se/")
getServerURL("www.dev.test.se?abc=32")
getServerURL("www.dev.test.se#abc")
getServerURL("//dev.test.se?sads")
getServerURL("http://www.dev.test.se#321")
getServerURL("http://localhost:8080/sads")
getServerURL("https://localhost:8080?sdsa")

طباعة

http://dev.test.se

http://dev.test.se

//ajax.googleapis.com

//

www.dev.test.se

www.dev.test.se

www.dev.test.se

www.dev.test.se

//dev.test.se

http://www.dev.test.se

http://localhost:8080

https://localhost:8080

لم ينجح أي مما سبق بالنسبة لي. إليك ما انتهيت إلى استخدامه:

/^(?:((?:https?|s?ftp):)\/\/)([^:\/\s]+)(?::(\d*))?(?:\/([^\s?#]+)?([?][^?#]*)?(#.*)?)?/

أنا أحب regex التي نشرت في "جافا سكريبت:أجزاء جيدة".وليس قصيرة جدا و ليست معقدة جدا.هذه الصفحة على جيثب أيضا شفرة جافا سكريبت التي يستخدمها.ولكن فإنه يمكن تكييفها لأي لغة.https://gist.github.com/voodooGQ/4057330

جافا يوفر URL الدرجة من شأنها أن تفعل هذا. الاستعلام URL الكائنات.

على الجانب علما, PHP يقدم parse_url().

وأود أن لا يوصي باستخدام التعابير المنطقية.استدعاء واجهة برمجة التطبيقات مثل WinHttpCrackUrl() هو أقل عرضه للخطأ.

http://msdn.microsoft.com/en-us/library/aa384092%28VS.85%29.aspx

حاولت عدد قليل من هذه التي لم تغطي احتياجات بلدي ، وخاصة على أعلى صوت الذي لم يمسك url دون مسار (http://example.com/)

أيضا عدم وجود أسماء المجموعة جعلها غير صالحة للاستعمال في ansible (أو ربما jinja2 هي التي تفتقر إلى المهارات).

لذلك هذا هو نسخة معدلة قليلا مع مصدر كونها أعلى صوت النسخة هنا:

^((?P<protocol>http[s]?|ftp):\/)?\/?(?P<host>[^:\/\s]+)(?P<path>((\/\w+)*\/)([\w\-\.]+[^#?\s]+))*(.*)?(#[\w\-]+)?$

باستخدام http://www.fileformat.info/tool/regex.htm hometoast هو regex يعمل كبيرة.

ولكن هنا هو التعامل أريد أن استخدام مختلف أنماط regex في حالات مختلفة في البرنامج.

على سبيل المثال لدي هذا الرابط, و لدي التعداد أن القوائم ويدعم جميع عناوين url في البرنامج.كل كائن في تعداد لديه طريقة getRegexPattern أن ترجع regex نمط والتي سيتم استخدامها مقارنة مع عنوان URL.إذا كان خاص regex نمط ترجع صحيح ، ثم أعلم أن هذا العنوان هو يدعمها البرنامج.لذا كل التعداد نفسه regex اعتمادا على حيث ينبغي أن ننظر داخل URL.

Hometoast اقتراح رائع لكن في حالتي أنا أعتقد أنه لن يساعد (إلا إذا كنت نسخ لصق نفس التعابير المنطقية في جميع التعدادات).

هذا هو السبب في أنني أردت الجواب على إعطاء regex لكل حالة على حدة.على الرغم من +1 hometoast.;)

أعلم أنك تدعي اللغة-الملحد على هذا ، ولكن يمكن لك أن تقول لنا ما كنت تستخدم فقط حتى نعرف ما regex قدرات لديك ؟

إذا كان لديك قدرات غير التقاط المباريات ، يمكنك تعديل hometoast التعبير بحيث subexpressions أن كنت لا ترغب في التقاط يتم إعداد مثل هذا:

(?:SOMESTUFF)

كنت لا تزال بحاجة الى نسخ و لصق (و تعديل طفيف) في Regex في أماكن متعددة ، ولكن هذا منطقي ... أنت لست مجرد التحقق لمعرفة ما إذا كان subexpression موجودا ، بل إذا كان موجودا كجزء من URL.استخدام غير التقاط التعديل على subexpressions يمكن أن تعطيك ما تحتاج ولا شيء أكثر من ذلك،, إذا أنا أقرأ بشكل صحيح, هو ما تريد.

مثلما الصغيرة علما ، hometoast التعبير لا تحتاج إلى وضع أقواس حول 's' ل 'https' ، لأنه فقط حرف واحد في هناك.محددو الكمية تحديد حرف واحد (أو فئة حرف أو subexpression) السابقة مباشرة لها.لذلك:

https?

أن المباراة 'http' أو 'https' على ما يرام.

regexp للحصول على مسار URL بدون الملف.

url = 'http://domain/dir1/dir2/somefile'
url.scan(/^(http://[^/]+)((?:/[^/]+)+(?=/))?/?(?:[^/]+)?$/i).to_s

فإنه يمكن أن يكون من المفيد إضافة المسار النسبي إلى عنوان url هذا.

// Sample URL to split.
String s = "https://www.thomas-bayer.com/axis2/services/BLZService?wsdl";

// Group 1: "http", one optional character, then "://". Group 2: the shortest run up to the
// first '/' or '?'. Group 3: that run of '/' or '?' characters. Group 4: everything after it.
String regex = "(^http.?://)(.*?)([/\\?]{1,})(.*)";

// replaceAll swaps the matched text for the chosen capture group, so each line prints one part.
System.out.println("1: " + s.replaceAll(regex, "$1"));
System.out.println("2: " + s.replaceAll(regex, "$2"));
System.out.println("3: " + s.replaceAll(regex, "$3"));
System.out.println("4: " + s.replaceAll(regex, "$4"));

سوف توفر الإخراج التالي:
1: https://
2: www.thomas-bayer.com
3: /
4: axis2/services/BLZService?wsdl

إذا قمت بتغيير عنوان URL
String s = "https://www.thomas-bayer.com?wsdl=qwerwer&ttt=888";
فسيكون الإخراج كالتالي:
1: https://
2: www.thomas-bayer.com
3: ?
4: wsdl=qwerwer&ttt=888

الاستمتاع..
يوزي ليف

على regex أن تفعل الكامل تحليل جدا البشعة.لقد تضمنت اسمه backreferences عن وضوح و كسر كل جزء في أسطر منفصلة, ولكن لا تزال تبدو مثل هذا:

^(?:(?P<protocol>\w+(?=:\/\/))(?::\/\/))?
(?:(?P<host>(?:(?:&(?:amp|apos|gt|lt|nbsp|quot|bull|hellip|[lr][ds]quo|[mn]dash|permil|\#[1-9][0-9]{1,3}|[A-Za-z][0-9A-Za-z]+);)|[^\/?#:]+)(?::(?P<port>[0-9]+))?)\/)?
(?:(?P<path>(?:(?:&(?:amp|apos|gt|lt|nbsp|quot|bull|hellip|[lr][ds]quo|[mn]dash|permil|\#[1-9][0-9]{1,3}|[A-Za-z][0-9A-Za-z]+);)|[^?#])+)\/)?
(?P<file>(?:(?:&(?:amp|apos|gt|lt|nbsp|quot|bull|hellip|[lr][ds]quo|[mn]dash|permil|\#[1-9][0-9]{1,3}|[A-Za-z][0-9A-Za-z]+);)|[^?#])+)
(?:\?(?P<querystring>(?:(?:&(?:amp|apos|gt|lt|nbsp|quot|bull|hellip|[lr][ds]quo|[mn]dash|permil|\#[1-9][0-9]{1,3}|[A-Za-z][0-9A-Za-z]+);)|[^#])+))?
(?:#(?P<fragment>.*))?$

الشيء الذي يتطلب أن يكون لذلك مطول هو أنه باستثناء البروتوكول أو الميناء ، أي من أجزاء يمكن أن تحتوي على HTML الكيانات ، مما يجعل ترسيم جزء صعبة جدا.حتى في الماضي بعض الحالات - المضيف, مسار, ملف, querystring ، جزء ، سمحنا أي html أو كيان أو أي حرف ليس ? أو #.على regex عن html الكيان تبدو مثل هذا:

$htmlentity = "&(?:amp|apos|gt|lt|nbsp|quot|bull|hellip|[lr][ds]quo|[mn]dash|permil|\#[1-9][0-9]{1,3}|[A-Za-z][0-9A-Za-z]+);"

عندما يتم استخراج ذلك (استخدمتُ صيغة mustache لتمثيله)، يصبح التعبير أكثر وضوحا بقليل:

^(?:(?P<protocol>(?:ht|f)tps?|\w+(?=:\/\/))(?::\/\/))?
(?:(?P<host>(?:{{htmlentity}}|[^\/?#:])+(?::(?P<port>[0-9]+))?)\/)?
(?:(?P<path>(?:{{htmlentity}}|[^?#])+)\/)?
(?P<file>(?:{{htmlentity}}|[^?#])+)
(?:\?(?P<querystring>(?:{{htmlentity}};|[^#])+))?
(?:#(?P<fragment>.*))?$

في جافا سكريبت, بالطبع, لا يمكنك استخدام اسمه backreferences حتى يصبح regex

^(?:(\w+(?=:\/\/))(?::\/\/))?(?:((?:(?:&(?:amp|apos|gt|lt|nbsp|quot|bull|hellip|[lr][ds]quo|[mn]dash|permil|\#[1-9][0-9]{1,3}|[A-Za-z][0-9A-Za-z]+);)|[^\/?#:]+)(?::([0-9]+))?)\/)?(?:((?:(?:&(?:amp|apos|gt|lt|nbsp|quot|bull|hellip|[lr][ds]quo|[mn]dash|permil|\#[1-9][0-9]{1,3}|[A-Za-z][0-9A-Za-z]+);)|[^?#])+)\/)?((?:(?:&(?:amp|apos|gt|lt|nbsp|quot|bull|hellip|[lr][ds]quo|[mn]dash|permil|\#[1-9][0-9]{1,3}|[A-Za-z][0-9A-Za-z]+);)|[^?#])+)(?:\?((?:(?:&(?:amp|apos|gt|lt|nbsp|quot|bull|hellip|[lr][ds]quo|[mn]dash|permil|\#[1-9][0-9]{1,3}|[A-Za-z][0-9A-Za-z]+);)|[^#])+))?(?:#(.*))?$

وفي كل مطابقة، البروتوكول هو \1، والمضيف هو \2، والمنفذ هو \3، والمسار \4، والملف \5، و querystring \6، والجزء (fragment) \7.

// Regex-based URL parsing
/**
 * Split a URL string into its components with a single regular expression.
 *
 * @param   url     the URL string to parse
 * @return  parsed  an object with one property per captured part (href, withoutHash, url,
 *                  origin, protocol, protocolseparator, credhost, cred, user, pass, host,
 *                  hostname, port, pathname, segment1, segment2, search, hash), or null
 *                  when the pattern does not match; a part missing from the input is
 *                  left undefined
 */
var UrlParser = function (url) {
    "use strict";

    // Property names in capture-group order: index 0 is the whole match.
    var partNames = [
        'href', 'withoutHash', 'url', 'origin', 'protocol', 'protocolseparator',
        'credhost', 'cred', 'user', 'pass', 'host', 'hostname', 'port',
        'pathname', 'segment1', 'segment2', 'search', 'hash'
    ];
    var pattern = /^(((([^:\/#\?]+:)?(?:(\/\/)((?:(([^:@\/#\?]+)(?:\:([^:@\/#\?]+))?)@)?(([^:\/#\?\]\[]+|\[[^\/\]@#?]+\])(?:\:([0-9]+))?))?)?)?((\/?(?:[^\/\?#]+\/+)*)([^\?#]*)))?(\?[^#]+)?)(#.*)?/;
    var found = pattern.exec(url);

    if (found === null) {
        return null;
    }

    var parsed = {};
    partNames.forEach(function (partName, groupIndex) {
        parsed[partName] = found[groupIndex];
    });
    return parsed;
};

// Example usage: parse a URL and log the resulting parts object.
// NOTE(review): `url` is not defined anywhere in this snippet; it must hold the URL string to parse.
var parsedURL=UrlParser(url);
console.log(parsedURL);

مرخصة بموجب: CC-BY-SA مع الإسناد

لا تنتمي إلى StackOverflow