diff --git a/docs/02_concepts/05_proxy_management.mdx b/docs/02_concepts/05_proxy_management.mdx index a0effcce..400696a0 100644 --- a/docs/02_concepts/05_proxy_management.mdx +++ b/docs/02_concepts/05_proxy_management.mdx @@ -56,13 +56,13 @@ When no `session_id` is provided, your custom proxy URLs are rotated round-robin ### Apify proxy configuration -With Apify Proxy, you can select specific proxy groups to use, or countries to connect from. This allows you to get better proxy performance after some initial research. +With Apify Proxy, you can select specific proxy groups to use, or countries to connect from. For even finer control, you can also target a specific subdivision (e.g. a US state) using the `subdivision_code` parameter alongside `country_code`. This allows you to get better proxy performance after some initial research. {ApifyProxyConfig} -Now your connections using proxy_url will use only Residential proxies from the US. Note that you must first get access to a proxy group before you are able to use it. You can find your available proxy groups in the [proxy dashboard](https://console.apify.com/proxy). +Now your connections using proxy_url will use only Residential proxies from California, US. The `subdivision_code` accepts a 1–3 character ISO 3166-2 code (e.g. `CA` for California) and currently only works for the United States (`country_code='US'`). Note that you must first get access to a proxy group before you are able to use it. You can find your available proxy groups in the [proxy dashboard](https://console.apify.com/proxy). If you don't specify any proxy groups, automatic proxy selection will be used. 
diff --git a/docs/02_concepts/code/05_apify_proxy_config.py b/docs/02_concepts/code/05_apify_proxy_config.py index 68f39e09..40dac738 100644 --- a/docs/02_concepts/code/05_apify_proxy_config.py +++ b/docs/02_concepts/code/05_apify_proxy_config.py @@ -8,6 +8,7 @@ async def main() -> None: proxy_cfg = await Actor.create_proxy_configuration( groups=['RESIDENTIAL'], country_code='US', + subdivision_code='CA', ) if not proxy_cfg: diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 015d78d6..9f4cd7ec 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -1315,6 +1315,7 @@ async def create_proxy_configuration( password: str | None = None, groups: list[str] | None = None, country_code: str | None = None, + subdivision_code: str | None = None, proxy_urls: list[str | None] | None = None, new_url_function: _NewUrlFunction | None = None, ) -> ProxyConfiguration | None: @@ -1332,6 +1333,8 @@ async def create_proxy_configuration( if available. groups: Proxy groups which the Apify Proxy should use, if provided. country_code: Country which the Apify Proxy should use, if provided. + subdivision_code: Subdivision (e.g. US state) which the Apify Proxy should use, if provided. + Requires `country_code` to be set. 1-3 character ISO 3166-2 code of uppercase letters/digits (e.g. `CA` for California). proxy_urls: Custom proxy server URLs which should be rotated through. new_url_function: Function which returns a custom proxy URL to be used. 
@@ -1342,6 +1345,7 @@ async def create_proxy_configuration( if actor_proxy_input is not None: if actor_proxy_input.get('useApifyProxy', False): country_code = country_code or actor_proxy_input.get('apifyProxyCountry') + subdivision_code = subdivision_code or actor_proxy_input.get('apifyProxySubdivision') groups = groups or actor_proxy_input.get('apifyProxyGroups') else: proxy_urls = actor_proxy_input.get('proxyUrls', []) @@ -1352,6 +1356,7 @@ async def create_proxy_configuration( password=password, groups=groups, country_code=country_code, + subdivision_code=subdivision_code, proxy_urls=proxy_urls, new_url_function=new_url_function, _actor_config=self.configuration, diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index a654cdd8..10cf277f 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -27,6 +27,8 @@ APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$') COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$') +# ISO 3166-2 subdivision codes are 1-3 uppercase alphanumeric characters, e.g. 'CA', 'NSW', '9' (Wien, AT-9) +SUBDIVISION_CODE_REGEX = re.compile(r'^[A-Z0-9]{1,3}$') SESSION_ID_MAX_LENGTH = 50 @@ -89,6 +91,13 @@ class ProxyInfo(CrawleeProxyInfo): This parameter is optional, by default, the proxy uses all available proxy servers from all countries. """ + subdivision_code: str | None = None + """If set, the proxy will use IP addresses geolocated to the specified subdivision (e.g. US state). + Requires `country_code` to be set. The subdivision code must be a 1-3 character ISO 3166-2 code + consisting of uppercase letters and digits (e.g. `CA` for California). Currently only supported for + the United States (`country_code='US'`). 
+ """ + @docs_group('Configuration') class ProxyConfiguration(CrawleeProxyConfiguration): @@ -111,6 +120,7 @@ def __init__( password: str | None = None, groups: list[str] | None = None, country_code: str | None = None, + subdivision_code: str | None = None, proxy_urls: list[str | None] | None = None, new_url_function: _NewUrlFunction | None = None, tiered_proxy_urls: list[list[str | None]] | None = None, @@ -126,6 +136,9 @@ def __init__( if available. groups: Proxy groups which the Apify Proxy should use, if provided. country_code: Country which the Apify Proxy should use, if provided. + subdivision_code: Subdivision (e.g. US state) which the Apify Proxy should use, if provided. + Requires `country_code` to be set. 1-3 character ISO 3166-2 code of uppercase letters/digits + (e.g. `CA` for California). proxy_urls: Custom proxy server URLs which should be rotated through. new_url_function: Function which returns a custom proxy URL to be used. tiered_proxy_urls: Proxy URLs arranged into tiers @@ -141,11 +154,17 @@ def __init__( country_code = str(country_code) _check(country_code, label='country_code', pattern=COUNTRY_CODE_REGEX) + if subdivision_code: + if not country_code: + raise ValueError('ProxyConfiguration: Cannot set "subdivision_code" without "country_code".') + subdivision_code = str(subdivision_code) + _check(subdivision_code, label='subdivision_code', pattern=SUBDIVISION_CODE_REGEX) + if (proxy_urls or new_url_function or tiered_proxy_urls) and (groups or country_code): raise ValueError( 'Cannot combine custom proxies with Apify Proxy!' ' It is not allowed to set "proxy_urls" or "new_url_function" combined with' - ' "groups" or "country_code".' + ' "groups", "country_code", or "subdivision_code".' 
) if proxy_urls and any('apify.com' in (url or '') for url in proxy_urls): @@ -176,6 +195,7 @@ def __init__( self._groups = list(groups) if groups else [] self._country_code = country_code + self._subdivision_code = subdivision_code async def initialize(self) -> None: """Check if using proxy, if so, check the access. @@ -247,6 +267,7 @@ async def new_proxy_info( proxy_tier=proxy_info.proxy_tier, groups=self._groups, country_code=self._country_code or None, + subdivision_code=self._subdivision_code or None, ) return ProxyInfo( @@ -309,7 +330,10 @@ def _get_username(self, session_id: int | str | None = None) -> str: if session_id is not None: parts.append(f'session-{session_id}') if self._country_code: - parts.append(f'country-{self._country_code}') + if self._subdivision_code: + parts.append(f'country-{self._country_code}_{self._subdivision_code}') + else: + parts.append(f'country-{self._country_code}') if not parts: return 'auto' diff --git a/tests/unit/actor/test_actor_create_proxy_configuration.py b/tests/unit/actor/test_actor_create_proxy_configuration.py index b441af11..86622b71 100644 --- a/tests/unit/actor/test_actor_create_proxy_configuration.py +++ b/tests/unit/actor/test_actor_create_proxy_configuration.py @@ -146,7 +146,22 @@ def request_handler(request: Request, response: Response) -> Response: == f'http://groups-{"+".join(groups)},country-{country_code}:{DUMMY_PASSWORD}@proxy.apify.com:8000' ) - assert len(patched_apify_client.calls['user']['get']) == 2 # ty: ignore[unresolved-attribute] - assert call_mock.call_count == 2 + subdivision = 'CA' + proxy_configuration = await Actor.create_proxy_configuration( + actor_proxy_input={ + 'useApifyProxy': True, + 'apifyProxyGroups': groups, + 'apifyProxyCountry': country_code, + 'apifyProxySubdivision': subdivision, + } + ) + assert proxy_configuration is not None + assert ( + await proxy_configuration.new_url() + == 
f'http://groups-{"+".join(groups)},country-{country_code}_{subdivision}:{DUMMY_PASSWORD}@proxy.apify.com:8000' + ) + + assert len(patched_apify_client.calls['user']['get']) == 3 # ty: ignore[unresolved-attribute] + assert call_mock.call_count == 3 await Actor.exit() diff --git a/tests/unit/test_proxy_configuration.py b/tests/unit/test_proxy_configuration.py index e43d2f9e..e0f65f78 100644 --- a/tests/unit/test_proxy_configuration.py +++ b/tests/unit/test_proxy_configuration.py @@ -88,6 +88,15 @@ def test_invalid_arguments() -> None: with pytest.raises(ValueError, match=match_pattern): ProxyConfiguration(country_code=invalid_country_code) # ty: ignore[invalid-argument-type] + for invalid_subdivision_code in ['California', 'ca', 'ABCD', 'A1b']: + escaped = re.escape(str(invalid_subdivision_code)) + match_pattern = f'Value {escaped} of argument subdivision_code does not match pattern' + with pytest.raises(ValueError, match=match_pattern): + ProxyConfiguration(country_code='US', subdivision_code=invalid_subdivision_code) + + with pytest.raises(ValueError, match=r'Cannot set "subdivision_code" without "country_code"'): + ProxyConfiguration(subdivision_code='CA') + with pytest.raises(ValueError, match=r'Exactly one of .* must be specified'): ProxyConfiguration( proxy_urls=['http://proxy.com:1111'], @@ -105,6 +114,9 @@ def test_invalid_arguments() -> None: new_url_function=lambda session_id=None, request=None: 'http://proxy.com:2222', groups=['GROUP1'] ) + with pytest.raises(ValueError, match=r'Cannot combine custom proxies with Apify Proxy'): + ProxyConfiguration(proxy_urls=['http://proxy.com:1111'], country_code='US', subdivision_code='CA') + async def test_new_url_basic() -> None: groups = ['GROUP1', 'GROUP2'] @@ -124,6 +136,26 @@ async def test_new_url_basic() -> None: assert proxy_url == f'http://{expected_username}:{password}@{expected_hostname}:{expected_port}' +async def test_new_url_with_subdivision() -> None: + groups = ['RESIDENTIAL'] + password = 'abcd1234' + 
country_code = 'US' + subdivision = 'CA' + proxy_configuration = ProxyConfiguration( + groups=groups, + password=password, + country_code=country_code, + subdivision_code=subdivision, + ) + proxy_url = await proxy_configuration.new_url() + + expected_username = f'groups-{"+".join(groups)},country-{country_code}_{subdivision}' + expected_hostname = 'proxy.apify.com' + expected_port = 8000 + + assert proxy_url == f'http://{expected_username}:{password}@{expected_hostname}:{expected_port}' + + async def test_new_url_with_session_ids() -> None: groups = ['GROUP1', 'GROUP2'] password = 'abcd1234' @@ -287,6 +319,7 @@ async def test_new_proxy_info_basic_construction() -> None: 'port': expected_port, 'groups': groups, 'country_code': country_code, + 'subdivision_code': None, 'username': expected_username, 'password': password, 'proxy_tier': None,