208 lines
7.4 KiB
Python
208 lines
7.4 KiB
Python
import os
|
||
import time
|
||
import random
|
||
import string
|
||
from selenium import webdriver
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.chrome.options import Options
|
||
from selenium.webdriver.chrome.service import Service
|
||
from webdriver_manager.chrome import ChromeDriverManager
|
||
|
||
# Text file holding the links that were already exported, one URL per line.
SEEN_LINKS_FILE = "seen_links.txt"

# Genre landing pages walked by the genre mode, in this order.
GENRE_URLS = [
    f"https://play.max.com/genre/{slug}"
    for slug in ("action", "comedy", "drama", "horror", "sci-fi", "documentary")
]
|
||
|
||
def setup_browser():
    """Start a Chrome session with the scraper's flags and return its driver.

    The chromedriver binary path comes from ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    chrome_flags = (
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-blink-features=AutomationControlled",
    )

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    chrome_service = Service(driver_path)
    return webdriver.Chrome(service=chrome_service, options=options)
|
||
|
||
def scroll_to_bottom(driver, max_wait=60, pause=2.5):
    """Scroll the current page down until its height stops growing.

    After every scroll the function waits ``pause`` seconds and re-reads
    ``document.body.scrollHeight``. It stops when the height is unchanged
    or when more than ``max_wait`` seconds have elapsed.

    Args:
        driver: Object exposing ``execute_script(script)``.
        max_wait: Time budget in seconds; checked once per iteration, after
            a scroll that did change the height.
        pause: Seconds to sleep after each scroll before measuring again.

    Returns:
        None. Progress is reported with ``print``.
    """
    print("\U0001f4dc Scrolling to load all movies...")
    height_script = "return document.body.scrollHeight"
    last_height = driver.execute_script(height_script)
    # A monotonic clock keeps the timeout correct if the system clock is
    # adjusted while the page is loading.
    started = time.monotonic()

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            print("\u2705 Reached bottom of page.")
            break
        last_height = new_height
        if time.monotonic() - started > max_wait:
            print("\u23f1\ufe0f Scroll timed out.")
            break
|
||
|
||
def extract_movie_links(driver):
    """Return the distinct movie URLs linked from the currently loaded page.

    Args:
        driver: Object exposing ``find_elements(by, value)`` whose results
            expose ``get_attribute(name)``.

    Returns:
        A list of href strings containing ``"/movie/"``, without duplicates,
        in the order they first appear among the matched anchors.
    """
    anchors = driver.find_elements(By.XPATH, '//a[contains(@href, "/movie/")]')

    # A dict is used as an ordered set so the result order is deterministic.
    found = {}
    for anchor in anchors:
        # Read the attribute once per element; it may come back as None.
        href = anchor.get_attribute("href")
        if href and "/movie/" in href:
            found[href] = None
    return list(found)
|
||
|
||
def generate_random_query():
    """Return a random string of one or two lowercase ASCII letters."""
    # The length is drawn before the letters.
    length = random.choice([1, 2])
    picked = random.choices(string.ascii_lowercase, k=length)
    return "".join(picked)
|
||
|
||
def load_seen_links(path=None):
    """Read the already-exported links from disk.

    Args:
        path: File to read, one link per line. Defaults to
            ``SEEN_LINKS_FILE`` when omitted.

    Returns:
        A set of the non-blank lines with surrounding whitespace removed,
        or an empty set when the file does not exist.
    """
    if path is None:
        path = SEEN_LINKS_FILE

    # Open directly instead of checking existence first: there is no gap
    # between the check and the open in which the file could disappear.
    try:
        with open(path, "r", encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            return {entry for entry in stripped if entry}
    except FileNotFoundError:
        return set()
|
||
|
||
def save_seen_links(new_links, path=None):
    """Append links to the seen-links file, one per line.

    The file is created if it does not exist; existing content is kept.

    Args:
        new_links: Iterable of link strings to append.
        path: File to append to. Defaults to ``SEEN_LINKS_FILE`` when
            omitted.
    """
    if path is None:
        path = SEEN_LINKS_FILE

    # Explicit encoding so the file does not depend on the platform locale.
    with open(path, "a", encoding="utf-8") as f:
        f.writelines(link + "\n" for link in new_links)
|
||
|
||
def save_links(links, batch_id, prefix, output_dir="output_movies", chunk_size=10):
    """Write links to numbered text files, ``chunk_size`` links per file.

    Files are named ``{prefix}_{batch_id}_{n}.txt`` inside ``output_dir``,
    with ``n`` counting up from 1. A name that already exists on disk is
    skipped rather than overwritten, so calling this again with the same
    ``prefix`` and ``batch_id`` continues the numbering instead of
    destroying earlier output.

    Args:
        links: Sequence of link strings (must support ``len`` and slicing).
        batch_id: Value embedded in the file name after the prefix.
        prefix: Leading part of each file name.
        output_dir: Directory to write into; created if missing.
        chunk_size: Maximum number of links per file.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    os.makedirs(output_dir, exist_ok=True)

    part = 0
    for start in range(0, len(links), chunk_size):
        chunk = links[start:start + chunk_size]

        # Advance to the next chunk number whose file is not on disk yet.
        part += 1
        file_path = os.path.join(output_dir, f"{prefix}_{batch_id}_{part}.txt")
        while os.path.exists(file_path):
            part += 1
            file_path = os.path.join(output_dir, f"{prefix}_{batch_id}_{part}.txt")

        with open(file_path, "w", encoding="utf-8") as f:
            f.writelines(link + "\n" for link in chunk)
        print(f"\U0001f4be Saved {len(chunk)} links to {file_path}")
|
||
|
||
def run_movies_mode(driver, seen_links, batch_counter):
    """Export a random sample of links from the /movies page.

    Loads the page, scrolls it to the bottom, collects the movie links,
    asks the user how many to pick, and saves the picked links that are
    not already in ``seen_links``.

    Args:
        driver: Browser driver used to load and read the page.
        seen_links: Set of links exported earlier. It is updated in place
            with the links saved here so later runs in the same session
            skip them too.
        batch_counter: Batch id embedded in the output file names.
    """
    driver.get("https://play.max.com/movies")
    time.sleep(4)
    scroll_to_bottom(driver)
    all_links = extract_movie_links(driver)
    print(f"\u2705 Loaded {len(all_links)} movie links.")

    while True:
        try:
            count = int(input("\U0001f522 How many random movies do you want to extract? "))
        except ValueError:
            print("\u274c Invalid number.")
            continue
        # random.sample rejects a negative sample size, so ask again.
        if count < 0:
            print("\u274c Invalid number.")
            continue
        break

    sample = random.sample(all_links, min(count, len(all_links)))
    unique_links = [link for link in sample if link not in seen_links]
    duplicates = len(sample) - len(unique_links)

    if unique_links:
        save_links(unique_links, batch_counter, "movies")
        save_seen_links(unique_links)
        # Keep the in-memory set in step with the file just appended to.
        seen_links.update(unique_links)
        print(f"\u2705 Saved {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
    else:
        print("\u26a0\ufe0f All selected links were duplicates.")
|
||
|
||
def run_search_mode(driver, seen_links, search_counter):
    """Run a series of random keyword searches and export new movie links.

    For each of ``max_attempts`` rounds a term is drawn at random from a
    fixed list, the search page is loaded, and links not already in
    ``seen_links`` are saved.

    Args:
        driver: Browser driver used to load and read the search pages.
        seen_links: Set of links exported earlier. It is updated in place
            with every link saved here, so a later round (or a later menu
            choice) does not export the same link again.
        search_counter: Batch id for the first saved result set; it is
            incremented locally after each round that saves something.
    """
    search_terms = [
        "the", "man", "love", "dark", "moon", "fire", "red", "blue", "night",
        "girl", "life", "death", "dream", "war", "blood", "star", "light", "king", "queen"
    ]
    max_attempts = 10
    found_total = 0

    for attempt in range(max_attempts):
        query = random.choice(search_terms)
        url = f"https://play.max.com/search?q={query}"
        print(f"\n\U0001f50d Searching for query: '{query}'")
        driver.get(url)
        time.sleep(4)

        links = extract_movie_links(driver)
        unique_links = [link for link in links if link not in seen_links]
        duplicates = len(links) - len(unique_links)

        if unique_links:
            save_links(unique_links, search_counter, "search")
            save_seen_links(unique_links)
            # Keep the in-memory set in step with the file just appended to.
            seen_links.update(unique_links)
            print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
            # Each saving round gets its own id so rounds in this call do
            # not share file names.
            # NOTE(review): the caller advances its own counter by only one
            # per call, so ids used here can be reused by a later call.
            search_counter += 1
            found_total += len(unique_links)
        else:
            print(f"\u26a0\ufe0f No new links found for query '{query}'.")

        # No search follows the final round, so there is nothing to wait for.
        if attempt < max_attempts - 1:
            print("\u23f3 Waiting 10 seconds before next search...")
            time.sleep(10)

    # "\U0001f4ca" is the chart emoji as one code point; the previous
    # "\ud83d\udcca" spelling produced two lone surrogates, which cannot be
    # encoded when printed.
    print(f"\n\U0001f4ca Search complete. Found total {found_total} new movie links in {max_attempts} searches.")
|
||
|
||
def run_genre_mode(driver, seen_links, genre_batch):
    """Visit every URL in ``GENRE_URLS`` and export the new movie links.

    Args:
        driver: Browser driver used to load and read the genre pages.
        seen_links: Set of links exported earlier. It is updated in place
            with every link saved here, so a movie listed under several
            genres is exported only once.
        genre_batch: Batch id for the first saved genre; it is incremented
            locally after each genre that saves something.
    """
    found_total = 0
    for genre_url in GENRE_URLS:
        print(f"\n\U0001f3ac Loading genre: {genre_url}")
        driver.get(genre_url)
        time.sleep(4)
        scroll_to_bottom(driver)
        links = extract_movie_links(driver)
        unique_links = [link for link in links if link not in seen_links]
        duplicates = len(links) - len(unique_links)

        if unique_links:
            save_links(unique_links, genre_batch, "genre")
            save_seen_links(unique_links)
            # Keep the in-memory set in step with the file just appended to.
            seen_links.update(unique_links)
            print(f"\u2705 Found {len(unique_links)} new links. \u267b Skipped {duplicates} duplicates.")
            # NOTE(review): the caller advances its own counter by only one
            # per call, so ids used here can be reused by a later call.
            genre_batch += 1
            found_total += len(unique_links)
        else:
            print("\u26a0\ufe0f No new unique links in this genre.")

        # NOTE(review): the original indentation was lost; this pause is
        # assumed to apply after every genre, not only the empty ones.
        time.sleep(5)

    print("📊 Genre scan complete. Found total", found_total, "new movie links.")
|
||
|
||
def main():
    """Run the interactive scraper session.

    Opens the browser, waits for the user to log in manually, then shows
    the menu in a loop until the user chooses to exit and confirms that
    the browser should be closed. The browser is always quit when this
    function ends, including when an exception propagates.
    """
    print("🛠️ Max.com Movie Scraper — Created by Mike | DRMLab.io Project")
    driver = setup_browser()

    try:
        print("\U0001f310 Opening Max.com homepage...")
        driver.get("https://play.max.com")
        time.sleep(3)

        print("\n\U0001f511 Please log in manually in the browser.")
        input("\u23f3 After you're logged in, press ENTER here to continue...")

        seen_links = load_seen_links()
        movie_batch = 1
        search_batch = 1
        genre_batch = 1

        while True:
            print("📋 === MAIN MENU ===")
            print("1️⃣ Extract movies from /movies page")
            print("2️⃣ Extract random movies via search")
            print("3️⃣ Extract movies by genre")
            print("4️⃣ Exit")
            choice = input("Enter your choice: ").strip()

            if choice == "1":
                run_movies_mode(driver, seen_links, movie_batch)
                movie_batch += 1
            elif choice == "2":
                run_search_mode(driver, seen_links, search_batch)
                search_batch += 1
            elif choice == "3":
                run_genre_mode(driver, seen_links, genre_batch)
                genre_batch += 1
            elif choice == "4":
                again = input("⏹ Do you want to close the browser? (yes/no): ").strip().lower()
                if again == "yes":
                    break
                # Anything else returns to the menu with the same browser,
                # instead of starting a second browser on top of this one.
            else:
                print("\u274c Invalid choice.")

        input("✅ Press ENTER to exit.")
    finally:
        # Cleanup only: no prompts here, so the browser is released even
        # when the session ends with an error or an interrupt.
        driver.quit()
|
||
|
||
# Start the interactive session only when this file is run as a script.
if __name__ == "__main__":
    main()
|
||
|